{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 185.1851851851852, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.018518518518518517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009821385610848665, "kl": 8.805232027953025e-06, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 268.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.037037037037037035, "frac_reward_zero_std": 0.0, "grad_norm": 3.621858596801758, "kl": 0.0004779777809744701, "learning_rate": 3e-09, "loss": 0.0384, "num_tokens": 563.0, "reward": 4.125, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.308422088623047, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.05555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 6.357028007507324, "kl": 0.0008478831732645631, "learning_rate": 6e-09, "loss": -0.2878, "num_tokens": 873.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.07407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 7.526264667510986, "kl": 0.0006631783908233047, "learning_rate": 9.000000000000001e-09, "loss": -0.0004, "num_tokens": 1114.0, "reward": 2.75, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.4433757066726685, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.09259259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 2.6715641021728516, "kl": 0.0006134712166385725, "learning_rate": 1.2e-08, "loss": -0.0587, "num_tokens": 1486.0, "reward": 0.375, "reward_std": 2.25, "rewards/reward_combined/mean": 0.375, "rewards/reward_combined/std": 2.25, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.999401569366455, "kl": 0.0002802525559673086, "learning_rate": 1.5000000000000002e-08, "loss": -0.0788, "num_tokens": 1772.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.12962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.5117316246032715, "kl": 0.0005427531723398715, "learning_rate": 1.8000000000000002e-08, "loss": 0.066, "num_tokens": 2110.0, "reward": 0.875, "reward_std": 2.0564937591552734, "rewards/reward_combined/mean": 0.875, "rewards/reward_combined/std": 2.0564937591552734, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.14814814814814814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002674127172213048, "kl": 2.959370590360777e-06, "learning_rate": 2.1e-08, "loss": 0.0, "num_tokens": 2330.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.16666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.315208435058594, "kl": 0.0001467584806960076, "learning_rate": 2.4e-08, "loss": 0.4813, "num_tokens": 2613.0, "reward": 2.25, "reward_std": 2.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 2.5, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.18518518518518517, "frac_reward_zero_std": 1.0, "grad_norm": 0.008726971223950386, "kl": 9.009348764266178e-05, "learning_rate": 2.7e-08, "loss": 0.0, "num_tokens": 2918.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.494390487670898, "kl": 0.00012619226527021965, "learning_rate": 3.0000000000000004e-08, "loss": 0.1288, "num_tokens": 3216.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.2222222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.008680141530930996, "kl": 0.00012682732995017432, "learning_rate": 3.3e-08, "loss": 0.0, "num_tokens": 3451.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005102040711790323, "clip_ratio/low_min": 0.005102040711790323, "clip_ratio/region_mean": 0.005102040711790323, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.24074074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.5816242694854736, "kl": 0.000613169715506956, "learning_rate": 3.6000000000000005e-08, "loss": -0.0895, "num_tokens": 3841.0, "reward": 3.0, "reward_std": 5.582711219787598, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 5.582711696624756, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.25925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.004939392674714327, "kl": 0.00022975738829700276, "learning_rate": 3.9e-08, "loss": 0.0, "num_tokens": 4135.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2777777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 5.921114444732666, "kl": 0.0004165449208812788, "learning_rate": 4.2e-08, "loss": 0.0782, "num_tokens": 4394.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.9938137531280518, "kl": 0.00014287605335994158, "learning_rate": 4.5e-08, "loss": 0.363, "num_tokens": 4747.0, "reward": 7.5, "reward_std": 1.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 1.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.363356828689575, "kl": 9.002560909721069e-05, "learning_rate": 4.8e-08, "loss": -0.0352, "num_tokens": 5169.0, "reward": -1.4500000476837158, "reward_std": 3.2827835083007812, "rewards/reward_combined/mean": -1.4500000476837158, "rewards/reward_combined/std": 3.2827835083007812, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 5.03240442276001, "kl": 0.00012843777949456125, "learning_rate": 5.100000000000001e-08, "loss": 0.1394, "num_tokens": 5446.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.35185185185185186, "frac_reward_zero_std": 1.0, "grad_norm": 0.014763526618480682, "kl": 0.000279415808563499, "learning_rate": 5.4e-08, "loss": 0.0, "num_tokens": 5752.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.37037037037037035, "frac_reward_zero_std": 0.0, "grad_norm": 2.9281020164489746, "kl": 1.8599247951556208e-05, "learning_rate": 5.7e-08, "loss": 0.0351, "num_tokens": 6038.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.528041839599609, "kl": 0.0005270973924780264, "learning_rate": 6.000000000000001e-08, "loss": 0.0281, "num_tokens": 6295.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.4074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.004833583254367113, "kl": 6.678203693155638e-05, "learning_rate": 6.300000000000001e-08, "loss": 0.0, "num_tokens": 6559.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.42592592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.013450962491333485, "kl": 0.00042931419739034027, "learning_rate": 6.6e-08, "loss": 0.0, "num_tokens": 6831.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.4444444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 7.01920747756958, "kl": 0.0005032640910940245, "learning_rate": 6.9e-08, "loss": 0.178, "num_tokens": 7091.0, "reward": 2.25, "reward_std": 2.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 2.5, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.46296296296296297, "frac_reward_zero_std": 1.0, "grad_norm": 2.3310747110372176e-06, "kl": -3.725290298461914e-09, "learning_rate": 7.200000000000001e-08, "loss": -0.0, "num_tokens": 7311.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 82.75, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.48148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 2.288567543029785, "kl": 0.0004660324193537235, "learning_rate": 7.500000000000001e-08, "loss": 0.4202, "num_tokens": 7870.0, "reward": 2.174999952316284, "reward_std": 1.649999976158142, "rewards/reward_combined/mean": 2.174999952316284, "rewards/reward_combined/std": 1.649999976158142, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 77.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 77.0, "completions/mean_terminated_length": 17.33333396911621, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.563615083694458, "kl": 0.0005240105965640396, "learning_rate": 7.8e-08, "loss": 0.3654, "num_tokens": 8394.0, "reward": 3.049999952316284, "reward_std": 4.439594745635986, "rewards/reward_combined/mean": 3.049999952316284, "rewards/reward_combined/std": 4.439594268798828, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.5185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 17.841129302978516, "kl": 0.0007524208049289882, "learning_rate": 8.1e-08, "loss": 0.2033, "num_tokens": 8650.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.5370370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04252630099654198, "kl": 0.0013804823393002152, "learning_rate": 8.4e-08, "loss": 0.0001, "num_tokens": 8872.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5555555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005104560405015945, "kl": 5.049820174463093e-06, "learning_rate": 8.700000000000001e-08, "loss": 0.0, "num_tokens": 9235.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5740740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 6.057951927185059, "kl": 0.00030255227466113865, "learning_rate": 9e-08, "loss": 0.1186, "num_tokens": 9551.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.5925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.524298191070557, "kl": 0.00030236503516789526, "learning_rate": 9.3e-08, "loss": 0.0886, "num_tokens": 9823.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.6111111111111112, "frac_reward_zero_std": 0.0, "grad_norm": 3.991766929626465, "kl": 0.00025091033603530377, "learning_rate": 9.6e-08, "loss": 0.0282, "num_tokens": 10088.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 79.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 79.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6296296296296297, "frac_reward_zero_std": 0.0, "grad_norm": 2.125446319580078, "kl": 0.00030589695961680263, "learning_rate": 9.900000000000001e-08, "loss": 0.4322, "num_tokens": 10632.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.6481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.04023784399032593, "kl": 0.00029347091913223267, "learning_rate": 1.0200000000000001e-07, "loss": 0.0, "num_tokens": 10842.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 89.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 89.25, "completions/mean_terminated_length": 33.66666793823242, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.945043921470642, "kl": 0.00041686483018565923, "learning_rate": 1.0500000000000001e-07, "loss": 0.4212, "num_tokens": 11427.0, "reward": 5.25, "reward_std": 5.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 5.5, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.6851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07464797049760818, "kl": 0.0011479780077934265, "learning_rate": 1.08e-07, "loss": 0.0001, "num_tokens": 11639.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.75, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.823786497116089, "kl": 0.0004799226007889956, "learning_rate": 1.11e-07, "loss": -0.0151, "num_tokens": 12054.0, "reward": 1.2999999523162842, "reward_std": 4.661187171936035, "rewards/reward_combined/mean": 1.2999999523162842, "rewards/reward_combined/std": 4.661187171936035, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7222222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.01007902156561613, "kl": 0.0002532374801376136, "learning_rate": 1.14e-07, "loss": 0.0, "num_tokens": 12403.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.7407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.023423200473189354, "kl": 0.0002661251783138141, "learning_rate": 1.17e-07, "loss": 0.0, "num_tokens": 12670.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 6.757240295410156, "kl": 0.00041234656964661554, "learning_rate": 1.2000000000000002e-07, "loss": -0.0342, "num_tokens": 12989.0, "reward": 0.625, "reward_std": 0.25, "rewards/reward_combined/mean": 0.625, "rewards/reward_combined/std": 0.25, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.7777777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.01539209857583046, "kl": 0.00012184950537630357, "learning_rate": 1.23e-07, "loss": 0.0, "num_tokens": 13285.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.000477473484352231, "kl": 1.5949416365401703e-06, "learning_rate": 1.2600000000000002e-07, "loss": 0.0, "num_tokens": 13599.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.974885940551758, "kl": 0.00047116080531850457, "learning_rate": 1.29e-07, "loss": -0.1904, "num_tokens": 13906.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.8333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.007154799532145262, "kl": 0.00013420979303191416, "learning_rate": 1.32e-07, "loss": 0.0, "num_tokens": 14174.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.8518518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.206045627593994, "kl": 0.00031978863989934325, "learning_rate": 1.35e-07, "loss": -0.0354, "num_tokens": 14464.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.8703703703703703, "frac_reward_zero_std": 1.0, "grad_norm": 0.02522652968764305, "kl": 0.00045043845602776855, "learning_rate": 1.38e-07, "loss": 0.0, "num_tokens": 14740.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8888888888888888, "frac_reward_zero_std": 0.0, "grad_norm": 3.281634569168091, "kl": 0.0002880931715480983, "learning_rate": 1.41e-07, "loss": 0.0011, "num_tokens": 15060.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.9074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.099334716796875, "kl": 0.0003625357348937541, "learning_rate": 1.4400000000000002e-07, "loss": 0.1533, "num_tokens": 15375.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.9259259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 3.123101234436035, "kl": 0.0005585844337474555, "learning_rate": 1.47e-07, "loss": 0.2408, "num_tokens": 15718.0, "reward": 4.625, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 4.308422088623047, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.9444444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 5.16160774230957, "kl": 0.0004420234326971695, "learning_rate": 1.5000000000000002e-07, "loss": 0.0612, "num_tokens": 16041.0, "reward": 4.125, "reward_std": 3.1721444129943848, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.1721444129943848, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9629629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.6957550048828125, "kl": 0.00021745844787801616, "learning_rate": 1.53e-07, "loss": 0.0535, "num_tokens": 16337.0, "reward": 5.25, "reward_std": 5.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 5.5, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.9814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 6.15764045715332, "kl": 0.00020359230984468013, "learning_rate": 1.56e-07, "loss": -0.0346, "num_tokens": 16650.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.2924599647521973, "kl": 0.0005566470790654421, "learning_rate": 1.59e-07, "loss": -0.0093, "num_tokens": 17038.0, "reward": 2.0, "reward_std": 4.6547465324401855, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 4.654747009277344, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017241379246115685, "clip_ratio/low_min": 0.017241379246115685, "clip_ratio/region_mean": 0.017241379246115685, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 1.0185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 8.021492004394531, "kl": 0.001439574727555737, "learning_rate": 1.62e-07, "loss": -0.0788, "num_tokens": 17314.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 1.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.524380922317505, "kl": 0.00041615290683694184, "learning_rate": 1.65e-07, "loss": 0.2664, "num_tokens": 17716.0, "reward": 2.375, "reward_std": 3.75, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 3.75, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.0555555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.5236964225769043, "kl": 3.65465457434766e-05, "learning_rate": 1.68e-07, "loss": 0.0292, "num_tokens": 17978.0, "reward": 3.375, "reward_std": 1.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 1.25, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.133392333984375, "kl": 0.0001687385083641857, "learning_rate": 1.71e-07, "loss": -0.0697, "num_tokens": 18249.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 1.0925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.004695798736065626, "kl": 0.0001588340091984719, "learning_rate": 1.7400000000000002e-07, "loss": 0.0, "num_tokens": 18594.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 1.1111111111111112, "frac_reward_zero_std": 0.0, "grad_norm": 5.737401485443115, "kl": 0.0004575830971589312, "learning_rate": 1.7699999999999998e-07, "loss": 0.0247, "num_tokens": 18907.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 1.1296296296296295, "frac_reward_zero_std": 0.0, "grad_norm": 2.4742252826690674, "kl": 5.735970080422703e-05, "learning_rate": 1.8e-07, "loss": 0.0259, "num_tokens": 19177.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.1481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016168957808986306, "kl": 1.7136335372924805e-05, "learning_rate": 1.83e-07, "loss": 0.0, "num_tokens": 19390.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 1.1666666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.007183755282312632, "kl": 0.0001739251489993876, "learning_rate": 1.86e-07, "loss": 0.0, "num_tokens": 19679.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 1.1851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 2.9159224033355713, "kl": 0.00013322310405783355, "learning_rate": 1.89e-07, "loss": -0.0022, "num_tokens": 19955.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.2037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.185230255126953, "kl": 0.0004092542003490962, "learning_rate": 1.92e-07, "loss": 0.15, "num_tokens": 20302.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.2222222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 1.3891133069992065, "kl": 0.00010689443297451362, "learning_rate": 1.95e-07, "loss": -0.0209, "num_tokens": 20668.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 1.2407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 3.615692138671875, "kl": 0.0004022928769700229, "learning_rate": 1.9800000000000003e-07, "loss": 0.0945, "num_tokens": 21006.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 1.2592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 3.6572940349578857, "kl": 0.0006055706035112962, "learning_rate": 2.01e-07, "loss": 0.245, "num_tokens": 21329.0, "reward": 3.875, "reward_std": 2.688710927963257, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 2.688710927963257, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.2777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.529059886932373, "kl": 0.0003714864724315703, "learning_rate": 2.0400000000000003e-07, "loss": -0.0404, "num_tokens": 21603.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.2962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007392583065666258, "kl": 1.1846422964367775e-05, "learning_rate": 2.0700000000000001e-07, "loss": 0.0, "num_tokens": 21863.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 1.3148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 3.763455629348755, "kl": 0.0002756074536591768, "learning_rate": 2.1000000000000003e-07, "loss": 0.156, "num_tokens": 22145.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3333333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.006291932892054319, "kl": 6.271153688430786e-05, "learning_rate": 2.13e-07, "loss": 0.0, "num_tokens": 22355.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.3518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.013468775898218155, "kl": 0.0001944929754245095, "learning_rate": 2.16e-07, "loss": 0.0, "num_tokens": 22619.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.3703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.007148392498493195, "kl": 0.00010193139314651489, "learning_rate": 2.19e-07, "loss": 0.0, "num_tokens": 22875.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 1.3888888888888888, "frac_reward_zero_std": 0.0, "grad_norm": 2.728374719619751, "kl": 0.00022550571520696394, "learning_rate": 2.22e-07, "loss": -0.0574, "num_tokens": 23166.0, "reward": 7.125, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 7.125, "rewards/reward_combined/std": 1.4361406564712524, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.4074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.527256011962891, "kl": 0.0014133312506601214, "learning_rate": 2.25e-07, "loss": 0.0118, "num_tokens": 23472.0, "reward": 1.75, "reward_std": 4.804512023925781, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 4.804512023925781, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.7154812812805176, "kl": 0.00011746803647838533, "learning_rate": 2.28e-07, "loss": 0.0371, "num_tokens": 23886.0, "reward": -0.9500000476837158, "reward_std": 2.245736837387085, "rewards/reward_combined/mean": -0.9500000476837158, "rewards/reward_combined/std": 2.245736837387085, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.4444444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028595319017767906, "kl": 9.809278344619088e-05, "learning_rate": 2.31e-07, "loss": 0.0, "num_tokens": 24205.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.462962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0056686620227992535, "kl": 9.486254481316791e-05, "learning_rate": 2.34e-07, "loss": 0.0, "num_tokens": 24512.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.4814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 7.017726421356201, "kl": 0.00020752509590238333, "learning_rate": 2.3700000000000002e-07, "loss": 0.4469, "num_tokens": 24827.0, "reward": 1.5, "reward_std": 2.4494898319244385, "rewards/reward_combined/mean": 1.5, "rewards/reward_combined/std": 2.4494898319244385, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 1.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.007125639356672764, "kl": 9.506940477876924e-05, "learning_rate": 2.4000000000000003e-07, "loss": 0.0, "num_tokens": 25063.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 1.5185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 3.8164546489715576, "kl": 7.238993202918209e-05, "learning_rate": 2.43e-07, "loss": 0.0508, "num_tokens": 25340.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5370370370370372, "frac_reward_zero_std": 1.0, "grad_norm": 0.00045325103565119207, "kl": 5.364418029785156e-07, "learning_rate": 2.46e-07, "loss": 0.0, "num_tokens": 25552.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.5555555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.442258358001709, "kl": 0.00023098269593901932, "learning_rate": 2.49e-07, "loss": 0.013, "num_tokens": 25923.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 0.8734862208366394, "kl": 8.921697735786438e-05, "learning_rate": 2.5200000000000003e-07, "loss": -0.0021, "num_tokens": 26235.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 1.5925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.5689330101013184, "kl": 0.0003830389105132781, "learning_rate": 2.5500000000000005e-07, "loss": 0.0298, "num_tokens": 26516.0, "reward": 5.5, "reward_std": 5.0, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 5.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.6111111111111112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018779064994305372, "kl": 5.648910428135423e-05, "learning_rate": 2.58e-07, "loss": 0.0, "num_tokens": 26806.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.6296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.028449570760130882, "kl": 0.0003179311752319336, "learning_rate": 2.6099999999999997e-07, "loss": 0.0, "num_tokens": 27022.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 1.6481481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.8877177238464355, "kl": 0.00024160796601790935, "learning_rate": 2.64e-07, "loss": 0.0515, "num_tokens": 27329.0, "reward": 5.875, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 2.462214469909668, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.6666666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.02491728961467743, "kl": 0.0009081092721316963, "learning_rate": 2.67e-07, "loss": 0.0, "num_tokens": 27685.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 90.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 35.66666793823242, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.6851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 1.6458799839019775, "kl": 0.0007071812287904322, "learning_rate": 2.7e-07, "loss": 0.5181, "num_tokens": 28264.0, "reward": 1.75, "reward_std": 2.362907886505127, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 2.362907886505127, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 1.7037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.708519458770752, "kl": 0.0002975300085381605, "learning_rate": 2.73e-07, "loss": -0.0639, "num_tokens": 28641.0, "reward": 1.25, "reward_std": 1.5, "rewards/reward_combined/mean": 1.25, "rewards/reward_combined/std": 1.5, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.7222222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.016283635050058365, "kl": 0.0003545835934346542, "learning_rate": 2.76e-07, "loss": 0.0, "num_tokens": 28975.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 83.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 83.75, "completions/mean_terminated_length": 26.33333396911621, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.7407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 1.6538562774658203, "kl": 0.00046103380736894906, "learning_rate": 2.79e-07, "loss": 0.4983, "num_tokens": 29562.0, "reward": 0.6749999523162842, "reward_std": 4.972172737121582, "rewards/reward_combined/mean": 0.6749999523162842, "rewards/reward_combined/std": 4.972172737121582, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 89.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 89.25, "completions/mean_terminated_length": 33.66666793823242, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.7592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 2.086613893508911, "kl": 0.0005300322663970292, "learning_rate": 2.82e-07, "loss": -0.0731, "num_tokens": 30147.0, "reward": 1.5, "reward_std": 1.3540064096450806, "rewards/reward_combined/mean": 1.5, "rewards/reward_combined/std": 1.3540064096450806, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 13.068719863891602, "kl": 0.00040509529645760267, "learning_rate": 2.85e-07, "loss": -0.2616, "num_tokens": 30366.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.7962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 1.969927325262688e-06, "kl": 0.0, "learning_rate": 2.8800000000000004e-07, "loss": 0.0, "num_tokens": 30586.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.8148148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.03004569560289383, "kl": 0.0005588829517364502, "learning_rate": 2.91e-07, "loss": 0.0, "num_tokens": 30830.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.8333333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.006689121946692467, "kl": 0.00021418453980004415, "learning_rate": 2.94e-07, "loss": 0.0, "num_tokens": 31121.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.8518518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 2.5398921966552734, "kl": 0.00028973876032978296, "learning_rate": 2.97e-07, "loss": -0.0279, "num_tokens": 31479.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.8703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.008573658764362335, "kl": 0.00012734341544273775, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "num_tokens": 31761.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 1.8888888888888888, "frac_reward_zero_std": 0.0, "grad_norm": 6.150824069976807, "kl": 0.0006834420200902969, "learning_rate": 3.0300000000000005e-07, "loss": 0.0776, "num_tokens": 32063.0, "reward": 5.5, "reward_std": 5.0, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 5.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 1.9074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 6.890849590301514, "kl": 0.00020996305920562008, "learning_rate": 3.06e-07, "loss": 0.1167, "num_tokens": 32384.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 10.60103702545166, "kl": 8.521063136868179e-05, "learning_rate": 3.09e-07, "loss": 0.1945, "num_tokens": 32625.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.9444444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.048797205090522766, "kl": 0.0004254840314388275, "learning_rate": 3.12e-07, "loss": 0.0, "num_tokens": 32879.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.9629629629629628, "frac_reward_zero_std": 0.0, "grad_norm": 3.783179998397827, "kl": 0.00042832360486499965, "learning_rate": 3.15e-07, "loss": 0.1537, "num_tokens": 33160.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.9814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 4.766118049621582, "kl": 0.0008387027846765704, "learning_rate": 3.18e-07, "loss": -0.0625, "num_tokens": 33450.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.873804569244385, "kl": 0.0007832607952877879, "learning_rate": 3.21e-07, "loss": 0.2217, "num_tokens": 33828.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.0185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 4.612246513366699, "kl": 0.000577682916627964, "learning_rate": 3.24e-07, "loss": -0.0154, "num_tokens": 34133.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 2.037037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01946515031158924, "kl": 0.0006353051285259426, "learning_rate": 3.27e-07, "loss": 0.0, "num_tokens": 34470.0, "reward": 0.5, "reward_std": 0.0, "rewards/reward_combined/mean": 0.5, "rewards/reward_combined/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.0555555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 1.6852893829345703, "kl": 0.0001253573518624762, "learning_rate": 3.3e-07, "loss": -0.0299, "num_tokens": 34887.0, "reward": 0.25, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 0.25, "rewards/reward_combined/std": 0.28867512941360474, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.2597243785858154, "kl": 0.00020556408708216622, "learning_rate": 3.3300000000000003e-07, "loss": -0.0082, "num_tokens": 35210.0, "reward": 5.625, "reward_std": 2.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 2.75, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 2.0925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.017365755513310432, "kl": 0.00045232250704430044, "learning_rate": 3.36e-07, "loss": 0.0, "num_tokens": 35421.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 85.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 85.25, "completions/mean_terminated_length": 28.33333396911621, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 2.111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.1956639289855957, "kl": 0.0004044400993734598, "learning_rate": 3.39e-07, "loss": 0.2955, "num_tokens": 35978.0, "reward": 0.675000011920929, "reward_std": 2.599198579788208, "rewards/reward_combined/mean": 0.675000011920929, "rewards/reward_combined/std": 2.599198579788208, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.1296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 3.4492805004119873, "kl": 0.00033524764876347035, "learning_rate": 3.42e-07, "loss": 0.1065, "num_tokens": 36255.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 2.148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.7756409645080566, "kl": 0.00026651281223166734, "learning_rate": 3.4500000000000003e-07, "loss": 0.053, "num_tokens": 36573.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007692307699471712, "clip_ratio/low_min": 0.007692307699471712, "clip_ratio/region_mean": 0.007692307699471712, "completion_length": 49.25, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 2.1666666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 3.275442361831665, "kl": 0.0005813548341393471, "learning_rate": 3.4800000000000005e-07, "loss": -0.0847, "num_tokens": 37022.0, "reward": 4.125, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.190763473510742, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 149.75, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 43.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.7361387014389038, "kl": 0.0004337559803389013, "learning_rate": 3.51e-07, "loss": 0.1386, "num_tokens": 37845.0, "reward": -0.8250000476837158, "reward_std": 5.578754425048828, "rewards/reward_combined/mean": -0.8250000476837158, "rewards/reward_combined/std": 5.57875394821167, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 2.2037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.416597366333008, "kl": 0.00023660504666622728, "learning_rate": 3.5399999999999997e-07, "loss": 0.038, "num_tokens": 38140.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.2222222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.010696391575038433, "kl": 0.00010671019845176488, "learning_rate": 3.57e-07, "loss": 0.0, "num_tokens": 38400.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 2.240740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 3.6675543785095215, "kl": 0.00046405295142903924, "learning_rate": 3.6e-07, "loss": 0.0599, "num_tokens": 38732.0, "reward": 3.25, "reward_std": 3.0686588287353516, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 3.0686588287353516, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 2.259259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.07305122166872025, "kl": 0.0010722950100898743, "learning_rate": 3.63e-07, "loss": 0.0001, "num_tokens": 38944.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 2.2777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 12.21618938446045, "kl": 0.0008814340108074248, "learning_rate": 3.66e-07, "loss": 0.2721, "num_tokens": 39170.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.2962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.745265007019043, "kl": 0.00030546652851626277, "learning_rate": 3.69e-07, "loss": 0.0257, "num_tokens": 39530.0, "reward": 2.0, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 1.7320507764816284, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 2.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.3485212326049805, "kl": 0.0005984032613923773, "learning_rate": 3.72e-07, "loss": 0.0998, "num_tokens": 39828.0, "reward": 1.75, "reward_std": 4.27200174331665, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 4.27200174331665, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.3333333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 1.9052869081497192, "kl": 0.00010325404946343042, "learning_rate": 3.75e-07, "loss": 0.4518, "num_tokens": 40355.0, "reward": 5.800000190734863, "reward_std": 4.400000095367432, "rewards/reward_combined/mean": 5.800000190734863, "rewards/reward_combined/std": 4.400000095367432, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 2.351851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00480230525135994, "kl": 5.0453531002858654e-05, "learning_rate": 3.78e-07, "loss": 0.0, "num_tokens": 40681.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 90.5, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 90.5, "completions/mean_terminated_length": 90.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.3703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 1.8034842014312744, "kl": 0.0003667814744403586, "learning_rate": 3.8100000000000004e-07, "loss": 0.3378, "num_tokens": 41263.0, "reward": 3.299999952316284, "reward_std": 4.982635974884033, "rewards/reward_combined/mean": 3.299999952316284, "rewards/reward_combined/std": 4.982636451721191, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.388888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015238815685734153, "kl": 2.2547319531440735e-05, "learning_rate": 3.84e-07, "loss": 0.0, "num_tokens": 41507.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.4074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.311396598815918, "kl": 0.0004804107011295855, "learning_rate": 3.87e-07, "loss": 0.0226, "num_tokens": 41839.0, "reward": 3.75, "reward_std": 3.1224989891052246, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 3.1224989891052246, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.007267743814736605, "kl": 0.00011053532580262981, "learning_rate": 3.9e-07, "loss": 0.0, "num_tokens": 42109.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.4444444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.01588411070406437, "kl": 0.0002482764102751389, "learning_rate": 3.9300000000000004e-07, "loss": 0.0, "num_tokens": 42373.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453706741333008, "kl": 0.0002344980457564816, "learning_rate": 3.9600000000000005e-07, "loss": 0.0328, "num_tokens": 42665.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0035197327379137278, "clip_ratio/low_min": 0.0035197327379137278, "clip_ratio/region_mean": 0.0035197327379137278, "completion_length": 142.25, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 28.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.4814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 1.361878514289856, "kl": 0.0006233485473785549, "learning_rate": 3.99e-07, "loss": 0.6507, "num_tokens": 43458.0, "reward": 1.6749999523162842, "reward_std": 4.897873878479004, "rewards/reward_combined/mean": 1.6749999523162842, "rewards/reward_combined/std": 4.897873878479004, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.012103725224733353, "kl": 0.0004016427083115559, "learning_rate": 4.02e-07, "loss": 0.0, "num_tokens": 43736.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.5185185185185186, "frac_reward_zero_std": 1.0, "grad_norm": 0.021656809374690056, "kl": 0.00031200129160424694, "learning_rate": 4.0500000000000004e-07, "loss": 0.0, "num_tokens": 44001.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 2.537037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.230462551116943, "kl": 0.0003037539281649515, "learning_rate": 4.0800000000000005e-07, "loss": 0.2245, "num_tokens": 44278.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 2.5555555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.006961924955248833, "kl": 6.836801912868395e-05, "learning_rate": 4.1100000000000007e-07, "loss": 0.0, "num_tokens": 44515.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 2.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 6.957086563110352, "kl": 0.00036130990338278934, "learning_rate": 4.1400000000000003e-07, "loss": 0.1463, "num_tokens": 44792.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 2.5925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 6.180386844789609e-05, "kl": 4.3958425521850586e-07, "learning_rate": 4.1700000000000004e-07, "loss": 0.0, "num_tokens": 45004.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.611111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.019667502492666245, "kl": 0.00045022181802778505, "learning_rate": 4.2000000000000006e-07, "loss": 0.0, "num_tokens": 45294.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 2.6296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002667341032065451, "kl": 4.52027666142385e-06, "learning_rate": 4.2299999999999996e-07, "loss": 0.0, "num_tokens": 45571.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.648148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.808579444885254, "kl": 0.0011022969265468419, "learning_rate": 4.26e-07, "loss": 0.1467, "num_tokens": 45917.0, "reward": 2.25, "reward_std": 2.020725965499878, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 2.020725965499878, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.6666666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 5.796252250671387, "kl": 5.559250591602449e-05, "learning_rate": 4.29e-07, "loss": 0.0009, "num_tokens": 46183.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 144 }, { "clip_ratio/high_max": 0.009999999776482582, "clip_ratio/high_mean": 0.009999999776482582, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009999999776482582, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.496193885803223, "kl": 0.0005811657465528697, "learning_rate": 4.32e-07, "loss": -0.0265, "num_tokens": 46515.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.7037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.0275633335113525, "kl": 0.0004115433621336706, "learning_rate": 4.3499999999999996e-07, "loss": -0.028, "num_tokens": 46828.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.7222222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039161560125648975, "kl": 5.484670327859931e-05, "learning_rate": 4.38e-07, "loss": 0.0, "num_tokens": 47084.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.7407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002509040350560099, "kl": 3.664294965233239e-06, "learning_rate": 4.41e-07, "loss": 0.0, "num_tokens": 47449.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 2.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 6.516165256500244, "kl": 0.0008194116380764171, "learning_rate": 4.44e-07, "loss": -0.1382, "num_tokens": 47737.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.7777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 6.077332973480225, "kl": 0.0009726728312671185, "learning_rate": 4.4699999999999997e-07, "loss": 0.1408, "num_tokens": 48024.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 2.7962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.980436325073242, "kl": 0.0003069709346164018, "learning_rate": 4.5e-07, "loss": -0.0582, "num_tokens": 48319.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.034146308898926, "kl": 0.0006649124989053234, "learning_rate": 4.53e-07, "loss": 0.0705, "num_tokens": 48647.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.8333333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 2.877814531326294, "kl": 0.0003990626319136936, "learning_rate": 4.56e-07, "loss": 0.0147, "num_tokens": 48991.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 2.851851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.5918796062469482, "kl": 0.00011902215555892326, "learning_rate": 4.59e-07, "loss": 0.0547, "num_tokens": 49290.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.8703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.005021293181926012, "kl": 8.40714678815857e-05, "learning_rate": 4.62e-07, "loss": 0.0, "num_tokens": 49598.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 2.888888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.039240170270204544, "kl": 0.0008037164807319641, "learning_rate": 4.65e-07, "loss": 0.0, "num_tokens": 49816.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 2.9074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026107507292181253, "kl": 0.00021249623387120664, "learning_rate": 4.68e-07, "loss": 0.0, "num_tokens": 50050.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006980568869039416, "kl": 3.6558136343955994e-05, "learning_rate": 4.71e-07, "loss": 0.0, "num_tokens": 50310.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 2.9444444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.000600357714574784, "kl": 1.3470649946611957e-05, "learning_rate": 4.7400000000000004e-07, "loss": 0.0, "num_tokens": 50530.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 2.962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.563671350479126, "kl": 0.0005213647673372179, "learning_rate": 4.77e-07, "loss": 0.0744, "num_tokens": 50865.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 75.25, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.9814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 2.941039800643921, "kl": 0.00040402429658570327, "learning_rate": 4.800000000000001e-07, "loss": 0.4818, "num_tokens": 51382.0, "reward": 2.549999952316284, "reward_std": 3.652852773666382, "rewards/reward_combined/mean": 2.549999952316284, "rewards/reward_combined/std": 3.6528525352478027, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008686037617735565, "kl": 1.1232991596443753e-05, "learning_rate": 4.830000000000001e-07, "loss": 0.0, "num_tokens": 51694.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 3.0185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 5.922219276428223, "kl": 0.0002770378050627187, "learning_rate": 4.86e-07, "loss": 0.1201, "num_tokens": 51959.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 3.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.320078372955322, "kl": 0.0008652110700495541, "learning_rate": 4.89e-07, "loss": 0.0749, "num_tokens": 52318.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 164 }, { "clip_ratio/high_max": 0.001805054140277207, "clip_ratio/high_mean": 0.001805054140277207, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001805054140277207, "completion_length": 80.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 80.75, "completions/mean_terminated_length": 22.33333396911621, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.0555555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.327766180038452, "kl": 0.0004794775159098208, "learning_rate": 4.92e-07, "loss": 0.4238, "num_tokens": 52861.0, "reward": 7.300000190734863, "reward_std": 0.40000009536743164, "rewards/reward_combined/mean": 7.300000190734863, "rewards/reward_combined/std": 0.40000009536743164, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 71.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 71.0, "completions/mean_terminated_length": 9.333333969116211, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.9788551330566406, "kl": 0.00014810793072683737, "learning_rate": 4.95e-07, "loss": 0.425, "num_tokens": 53369.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.0925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.228071212768555, "kl": 0.00040546040690969676, "learning_rate": 4.98e-07, "loss": 0.0601, "num_tokens": 53701.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 3.111111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003313072375021875, "kl": 7.852911949157715e-06, "learning_rate": 5.01e-07, "loss": 0.0, "num_tokens": 53921.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.1296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 6.729984283447266, "kl": 0.00021809947793371975, "learning_rate": 5.040000000000001e-07, "loss": -0.0033, "num_tokens": 54212.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.148148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.005163852591067553, "kl": 8.982420331449248e-05, "learning_rate": 5.070000000000001e-07, "loss": 0.0, "num_tokens": 54472.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 3.1666666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.001008315128274262, "kl": 7.666647434234619e-06, "learning_rate": 5.100000000000001e-07, "loss": 0.0, "num_tokens": 54708.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.017396019771695137, "kl": 0.0002985633864227566, "learning_rate": 5.13e-07, "loss": 0.0, "num_tokens": 54996.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.2037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.3503596782684326, "kl": 0.0009463485621381551, "learning_rate": 5.16e-07, "loss": -0.0942, "num_tokens": 55333.0, "reward": 0.42500001192092896, "reward_std": 0.15000000596046448, "rewards/reward_combined/mean": 0.42500001192092896, "rewards/reward_combined/std": 0.14999999105930328, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.2222222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.0455613136291504, "kl": 0.00022622020333074033, "learning_rate": 5.189999999999999e-07, "loss": 0.0367, "num_tokens": 55621.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 3.240740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.021082421764731407, "kl": 0.00023952528135851026, "learning_rate": 5.219999999999999e-07, "loss": 0.0, "num_tokens": 55872.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 3.259259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 3.6618916988372803, "kl": 3.9831074900575913e-05, "learning_rate": 5.25e-07, "loss": 0.0027, "num_tokens": 56198.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.2777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.9709019660949707, "kl": 0.00026796314341481775, "learning_rate": 5.28e-07, "loss": 0.0813, "num_tokens": 56473.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.2962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.1422834396362305, "kl": 0.00045865429274272174, "learning_rate": 5.31e-07, "loss": -0.1223, "num_tokens": 56796.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.25, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.48903226852417, "kl": 0.0006679217622149736, "learning_rate": 5.34e-07, "loss": 0.111, "num_tokens": 57209.0, "reward": -1.25, "reward_std": 2.0615527629852295, "rewards/reward_combined/mean": -1.25, "rewards/reward_combined/std": 2.0615527629852295, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.013888888992369175, "clip_ratio/region_mean": 0.013888888992369175, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.3333333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 4.716423988342285, "kl": 0.0008695235010236502, "learning_rate": 5.37e-07, "loss": -0.0669, "num_tokens": 57518.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.351851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.7347817420959473, "kl": 0.00016060109555837698, "learning_rate": 5.4e-07, "loss": -0.0739, "num_tokens": 57794.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 3.3703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017193271778523922, "kl": 3.500526327115949e-05, "learning_rate": 5.43e-07, "loss": 0.0, "num_tokens": 58043.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.388888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.462109327316284, "kl": 0.0006443507445510477, "learning_rate": 5.46e-07, "loss": 0.2934, "num_tokens": 58431.0, "reward": 1.25, "reward_std": 4.573474407196045, "rewards/reward_combined/mean": 1.25, "rewards/reward_combined/std": 4.573474407196045, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.4074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.9230544567108154, "kl": 5.265841173240915e-05, "learning_rate": 5.49e-07, "loss": 0.0355, "num_tokens": 58721.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.5152268409729, "kl": 0.0005125022289576009, "learning_rate": 5.52e-07, "loss": 0.1367, "num_tokens": 59055.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.4444444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 3.8972930908203125, "kl": 0.0002818711072904989, "learning_rate": 5.55e-07, "loss": 0.0262, "num_tokens": 59353.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 84.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 84.0, "completions/mean_terminated_length": 26.666667938232422, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 3.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.7512755393981934, "kl": 0.0004527562850853428, "learning_rate": 5.58e-07, "loss": 0.2744, "num_tokens": 59905.0, "reward": 3.5, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 3.674234628677368, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.4814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 6.72933292388916, "kl": 0.0010080479551106691, "learning_rate": 5.61e-07, "loss": 0.1277, "num_tokens": 60182.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.7610340118408203, "kl": 0.0001639571419218555, "learning_rate": 5.64e-07, "loss": 0.0005, "num_tokens": 60453.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.5185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 4.874295711517334, "kl": 0.0006885581533424556, "learning_rate": 5.67e-07, "loss": 0.0965, "num_tokens": 60785.0, "reward": 2.375, "reward_std": 3.8810436725616455, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 3.8810436725616455, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.537037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004016809107270092, "kl": 3.1813151508686133e-06, "learning_rate": 5.7e-07, "loss": 0.0, "num_tokens": 61096.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.5555555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 4.614760398864746, "kl": 0.00044257061745156534, "learning_rate": 5.730000000000001e-07, "loss": 0.1746, "num_tokens": 61399.0, "reward": 5.25, "reward_std": 5.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 5.5, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 3.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.004102109931409359, "kl": 2.244114875793457e-05, "learning_rate": 5.760000000000001e-07, "loss": 0.0, "num_tokens": 61607.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 3.5925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020067833829671144, "kl": 7.15915666660294e-05, "learning_rate": 5.79e-07, "loss": 0.0, "num_tokens": 61895.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 82.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.611111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.740919351577759, "kl": 0.0005280704936012626, "learning_rate": 5.82e-07, "loss": 0.1512, "num_tokens": 62451.0, "reward": 1.7999999523162842, "reward_std": 4.725110054016113, "rewards/reward_combined/mean": 1.7999999523162842, "rewards/reward_combined/std": 4.725110054016113, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.6296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 1.941718578338623, "kl": 0.0002683470520423725, "learning_rate": 5.85e-07, "loss": 0.0117, "num_tokens": 62858.0, "reward": 0.925000011920929, "reward_std": 1.4338176250457764, "rewards/reward_combined/mean": 0.925000011920929, "rewards/reward_combined/std": 1.433817744255066, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.648148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.010845618322491646, "kl": 0.00014817271221545525, "learning_rate": 5.88e-07, "loss": 0.0, "num_tokens": 63124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 3.6666666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.010536052286624908, "kl": 0.00016219168901443481, "learning_rate": 5.91e-07, "loss": 0.0, "num_tokens": 63336.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.685185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 7.782825559843332e-05, "kl": 5.132135243002267e-06, "learning_rate": 5.94e-07, "loss": 0.0, "num_tokens": 63644.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 3.7037037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017704206402413547, "kl": 4.967053683913036e-07, "learning_rate": 5.970000000000001e-07, "loss": 0.0, "num_tokens": 63860.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.7222222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 8.628633077023551e-05, "kl": 3.933285597668146e-06, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "num_tokens": 64224.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.8029863834381104, "kl": 0.0006761455442756414, "learning_rate": 6.030000000000001e-07, "loss": 0.0419, "num_tokens": 64550.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 68.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 68.5, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 3.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.2645158767700195, "kl": 0.00014356183964991942, "learning_rate": 6.060000000000001e-07, "loss": 0.4782, "num_tokens": 65032.0, "reward": 2.924999952316284, "reward_std": 2.1500000953674316, "rewards/reward_combined/mean": 2.924999952316284, "rewards/reward_combined/std": 2.1500000953674316, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.7777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 6.375810146331787, "kl": 0.0004472557484405115, "learning_rate": 6.09e-07, "loss": 0.0486, "num_tokens": 65339.0, "reward": 1.125, "reward_std": 1.8427786827087402, "rewards/reward_combined/mean": 1.125, "rewards/reward_combined/std": 1.8427786827087402, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 3.7962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016529974527657032, "kl": 9.95248547042138e-06, "learning_rate": 6.12e-07, "loss": 0.0, "num_tokens": 65558.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.2477164268493652, "kl": 0.00011237839862587862, "learning_rate": 6.149999999999999e-07, "loss": 0.0127, "num_tokens": 65878.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.8333333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 2.262871265411377, "kl": 0.00013070139539195225, "learning_rate": 6.18e-07, "loss": 0.0289, "num_tokens": 66192.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010668719187378883, "kl": 0.0002653169158293167, "learning_rate": 6.21e-07, "loss": 0.0, "num_tokens": 66463.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.8703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 2.6986334323883057, "kl": 2.062311068584677e-05, "learning_rate": 6.24e-07, "loss": 0.0016, "num_tokens": 66740.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.502748966217041, "kl": 0.0008518050017300993, "learning_rate": 6.27e-07, "loss": 0.0399, "num_tokens": 67051.0, "reward": 3.0, "reward_std": 3.316624879837036, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 3.316624879837036, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.9074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.010843516327440739, "kl": 0.000132754968944937, "learning_rate": 6.3e-07, "loss": 0.0, "num_tokens": 67351.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00037427424103952944, "kl": 1.382132359140087e-05, "learning_rate": 6.33e-07, "loss": 0.0, "num_tokens": 67613.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.9444444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.008297150954604149, "kl": 0.00022291956702247262, "learning_rate": 6.36e-07, "loss": 0.0, "num_tokens": 67886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.821705341339111, "kl": 0.0011893765986314975, "learning_rate": 6.39e-07, "loss": 0.0412, "num_tokens": 68144.0, "reward": 2.5, "reward_std": 2.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 2.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.9814814814814814, "frac_reward_zero_std": 1.0, "grad_norm": 0.014322387985885143, "kl": 0.0002677934747055133, "learning_rate": 6.42e-07, "loss": 0.0, "num_tokens": 68442.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 98.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 98.25, "completions/mean_terminated_length": 45.66666793823242, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.120150089263916, "kl": 0.00032934667251538485, "learning_rate": 6.45e-07, "loss": 0.0681, "num_tokens": 69071.0, "reward": 1.2999999523162842, "reward_std": 1.9646884202957153, "rewards/reward_combined/mean": 1.2999999523162842, "rewards/reward_combined/std": 1.9646883010864258, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.018518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 4.7130351066589355, "kl": 0.00020774344011442736, "learning_rate": 6.48e-07, "loss": 0.2083, "num_tokens": 69388.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.293214797973633, "kl": 0.00045467575546354055, "learning_rate": 6.51e-07, "loss": 0.1662, "num_tokens": 69699.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 4.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022832085378468037, "kl": 2.7919808985643613e-05, "learning_rate": 6.54e-07, "loss": 0.0, "num_tokens": 69915.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.135390281677246, "kl": 0.0006713748298352584, "learning_rate": 6.57e-07, "loss": -0.0096, "num_tokens": 70218.0, "reward": 5.875, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 2.462214469909668, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 4.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008521368145011365, "kl": 1.3694167137145996e-05, "learning_rate": 6.6e-07, "loss": 0.0, "num_tokens": 70462.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 8.929198265075684, "kl": 0.0007695133681409061, "learning_rate": 6.63e-07, "loss": 0.0979, "num_tokens": 70700.0, "reward": 2.5, "reward_std": 3.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 4.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013419704046100378, "kl": 2.1379492409323575e-05, "learning_rate": 6.660000000000001e-07, "loss": 0.0, "num_tokens": 71020.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 4.148148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02378317527472973, "kl": 0.00020574219524860382, "learning_rate": 6.690000000000001e-07, "loss": 0.0, "num_tokens": 71280.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.166666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.6256306171417236, "kl": 0.0004987402644474059, "learning_rate": 6.72e-07, "loss": -0.0001, "num_tokens": 71611.0, "reward": 1.75, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.4433757066726685, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00030701482319273055, "kl": 9.885265171760693e-06, "learning_rate": 6.75e-07, "loss": 0.0, "num_tokens": 71923.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.203703703703703, "frac_reward_zero_std": 0.0, "grad_norm": 5.001032829284668, "kl": 0.0008387054549530149, "learning_rate": 6.78e-07, "loss": 0.3488, "num_tokens": 72240.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.222222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.3922600746154785, "kl": 0.0004214024083921686, "learning_rate": 6.81e-07, "loss": 0.1584, "num_tokens": 72553.0, "reward": 2.5, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.34165620803833, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.2407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.5442514419555664, "kl": 0.0010614169004838914, "learning_rate": 6.84e-07, "loss": -0.0212, "num_tokens": 72900.0, "reward": 1.875, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 1.6007810831069946, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.2592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.6496753692626953, "kl": 0.0005403376708272845, "learning_rate": 6.87e-07, "loss": 0.1478, "num_tokens": 73236.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.277777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 1.4552295207977295, "kl": 0.00028668949380517006, "learning_rate": 6.900000000000001e-07, "loss": -0.2265, "num_tokens": 73604.0, "reward": 4.125, "reward_std": 2.25, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 2.25, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 4.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.199924945831299, "kl": 0.00014511148037854582, "learning_rate": 6.930000000000001e-07, "loss": 0.0255, "num_tokens": 73865.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.181790828704834, "kl": 0.0004374007985461503, "learning_rate": 6.960000000000001e-07, "loss": 0.0313, "num_tokens": 74169.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.676959276199341, "kl": 0.001127946306951344, "learning_rate": 6.990000000000001e-07, "loss": 0.0259, "num_tokens": 74450.0, "reward": 5.25, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 3.4034297466278076, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.351851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.054653167724609, "kl": 0.00033280889329034835, "learning_rate": 7.02e-07, "loss": 0.2928, "num_tokens": 74755.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.008404637686908245, "kl": 0.00020041676543769427, "learning_rate": 7.05e-07, "loss": 0.0, "num_tokens": 75119.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.388888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 6.890386581420898, "kl": 0.0002970975074276794, "learning_rate": 7.079999999999999e-07, "loss": 0.178, "num_tokens": 75390.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 4.346397327026352e-05, "kl": 1.4379620552062988e-06, "learning_rate": 7.11e-07, "loss": 0.0, "num_tokens": 75610.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011283610947430134, "kl": 2.702176516322652e-05, "learning_rate": 7.14e-07, "loss": 0.0, "num_tokens": 75870.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.01615816354751587, "kl": 0.0001490861177444458, "learning_rate": 7.17e-07, "loss": 0.0, "num_tokens": 76126.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.25, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.8408299684524536, "kl": 0.00019194966807845049, "learning_rate": 7.2e-07, "loss": -0.068, "num_tokens": 76531.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 4.481481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.039353568106889725, "kl": 0.0008344904235855211, "learning_rate": 7.23e-07, "loss": 0.0, "num_tokens": 76799.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 88.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 88.25, "completions/mean_terminated_length": 32.333335876464844, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 4.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.2978901863098145, "kl": 0.0006689954025205225, "learning_rate": 7.26e-07, "loss": 0.4072, "num_tokens": 77404.0, "reward": 0.42500001192092896, "reward_std": 0.5377421379089355, "rewards/reward_combined/mean": 0.42500001192092896, "rewards/reward_combined/std": 0.5377421975135803, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.518518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 1.2825461626052856, "kl": 9.950984167517163e-05, "learning_rate": 7.29e-07, "loss": -0.0702, "num_tokens": 77827.0, "reward": 0.75, "reward_std": 1.5, "rewards/reward_combined/mean": 0.75, "rewards/reward_combined/std": 1.5, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 4.537037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018169282702729106, "kl": 2.6743327453004895e-05, "learning_rate": 7.32e-07, "loss": 0.0, "num_tokens": 78061.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 3.9174177646636963, "kl": 0.0008164856844814494, "learning_rate": 7.350000000000001e-07, "loss": 0.0358, "num_tokens": 78343.0, "reward": 7.0, "reward_std": 2.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 2.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 4.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.0350708961486816, "kl": 0.0002487677338649519, "learning_rate": 7.38e-07, "loss": 0.1144, "num_tokens": 78694.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 2.315531015396118, "kl": 0.00017569374904269353, "learning_rate": 7.41e-07, "loss": -0.013, "num_tokens": 79002.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.611111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 5.840476036071777, "kl": 0.00015407590763061307, "learning_rate": 7.44e-07, "loss": 0.1457, "num_tokens": 79279.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007033797446638346, "kl": 0.00014726072549819946, "learning_rate": 7.47e-07, "loss": 0.0, "num_tokens": 79495.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 71.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 71.75, "completions/mean_terminated_length": 10.333333969116211, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.648148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.511259078979492, "kl": 0.0008783047087490559, "learning_rate": 7.5e-07, "loss": 0.4654, "num_tokens": 80010.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.666666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.017466451972723007, "kl": 0.0004449118932825513, "learning_rate": 7.53e-07, "loss": 0.0, "num_tokens": 80278.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.481826305389404, "kl": 0.00036090027424506843, "learning_rate": 7.56e-07, "loss": -0.0047, "num_tokens": 80566.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 4.703703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010092331795021892, "kl": 2.580881118774414e-05, "learning_rate": 7.590000000000001e-07, "loss": 0.0, "num_tokens": 80774.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.722222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 7.845901966094971, "kl": 0.000812845813925378, "learning_rate": 7.620000000000001e-07, "loss": 0.0011, "num_tokens": 81049.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 255 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 4.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.7894251346588135, "kl": 0.00016434883582405746, "learning_rate": 7.65e-07, "loss": 0.029, "num_tokens": 81388.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 4.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 4.973078727722168, "kl": 0.00030741008231416345, "learning_rate": 7.68e-07, "loss": 0.0279, "num_tokens": 81684.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.777777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.000896158569958061, "kl": 1.6787877029855736e-05, "learning_rate": 7.71e-07, "loss": 0.0, "num_tokens": 81968.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 4.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024601693730801344, "kl": 6.632786244153976e-05, "learning_rate": 7.74e-07, "loss": 0.0, "num_tokens": 82280.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.023117998614907265, "kl": 0.0003232210933674651, "learning_rate": 7.77e-07, "loss": 0.0, "num_tokens": 82499.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 4.833333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 5.515130043029785, "kl": 0.00047652318608015776, "learning_rate": 7.8e-07, "loss": -0.0411, "num_tokens": 82790.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 4.851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.011952731758356094, "kl": 0.00011453933620941825, "learning_rate": 7.830000000000001e-07, "loss": 0.0, "num_tokens": 83058.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.928591728210449, "kl": 0.0006977445736993104, "learning_rate": 7.860000000000001e-07, "loss": -0.0346, "num_tokens": 83348.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.726197242736816, "kl": 0.000552371478988789, "learning_rate": 7.890000000000001e-07, "loss": 0.0243, "num_tokens": 83707.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.907407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.03761586919426918, "kl": 0.0003040581941604614, "learning_rate": 7.920000000000001e-07, "loss": 0.0, "num_tokens": 83919.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 7.180360794067383, "kl": 0.0006064912013243884, "learning_rate": 7.95e-07, "loss": -0.0948, "num_tokens": 84215.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 4.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 4.671012878417969, "kl": 0.0007608090818393975, "learning_rate": 7.98e-07, "loss": -0.0114, "num_tokens": 84547.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 4.962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.696273326873779, "kl": 0.0005270361725706607, "learning_rate": 8.01e-07, "loss": 0.1728, "num_tokens": 84908.0, "reward": 0.42500001192092896, "reward_std": 0.14999999105930328, "rewards/reward_combined/mean": 0.42500001192092896, "rewards/reward_combined/std": 0.14999999105930328, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.981481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.0282697677612305, "kl": 0.0012080430533387698, "learning_rate": 8.04e-07, "loss": 0.0341, "num_tokens": 85193.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 5.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.011410972103476524, "kl": 0.0003721677167050075, "learning_rate": 8.070000000000001e-07, "loss": 0.0, "num_tokens": 85519.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.018518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 8.44157600402832, "kl": 0.00143510103225708, "learning_rate": 8.100000000000001e-07, "loss": -0.1716, "num_tokens": 85759.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 271 }, { "clip_ratio/high_max": 0.007462686393409967, "clip_ratio/high_mean": 0.007462686393409967, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007462686393409967, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 5.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.5781354904174805, "kl": 0.0007736670668236911, "learning_rate": 8.130000000000001e-07, "loss": 0.1346, "num_tokens": 86096.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 5.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 4.370683670043945, "kl": 0.0007411043479805812, "learning_rate": 8.160000000000001e-07, "loss": -0.0394, "num_tokens": 86372.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.6596527099609375, "kl": 0.0003138103329547448, "learning_rate": 8.190000000000001e-07, "loss": -0.0019, "num_tokens": 86658.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 5.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 3.514913558959961, "kl": 0.0005813508469145745, "learning_rate": 8.220000000000001e-07, "loss": 0.0046, "num_tokens": 86923.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 5.111111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009333821944892406, "kl": 3.992146775999572e-05, "learning_rate": 8.25e-07, "loss": 0.0, "num_tokens": 87200.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 5.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.005320832133293152, "kl": 7.409416139125824e-05, "learning_rate": 8.280000000000001e-07, "loss": 0.0, "num_tokens": 87460.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 5.148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.405511856079102, "kl": 0.0012021985021419823, "learning_rate": 8.310000000000001e-07, "loss": 0.0265, "num_tokens": 87799.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 5.166666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 13.089540481567383, "kl": 0.0009957485017366707, "learning_rate": 8.340000000000001e-07, "loss": -0.0519, "num_tokens": 88057.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 5.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005388214252889156, "kl": 0.00019227433949708939, "learning_rate": 8.370000000000001e-07, "loss": 0.0, "num_tokens": 88377.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 5.203703703703703, "frac_reward_zero_std": 0.0, "grad_norm": 2.229790210723877, "kl": 0.0011238459264859557, "learning_rate": 8.400000000000001e-07, "loss": 0.0275, "num_tokens": 88691.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 5.222222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.031806945800781, "kl": 0.0004676797325373627, "learning_rate": 8.430000000000001e-07, "loss": 0.1669, "num_tokens": 88970.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 5.2407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 5.256507873535156, "kl": 0.0004470757266972214, "learning_rate": 8.459999999999999e-07, "loss": 0.2016, "num_tokens": 89307.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.2592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.04802239313721657, "kl": 0.0007820095343049616, "learning_rate": 8.489999999999999e-07, "loss": 0.0, "num_tokens": 89579.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 5.277777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.3241055011749268, "kl": 0.0007035969611024484, "learning_rate": 8.52e-07, "loss": -0.0473, "num_tokens": 89933.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 5.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.4867427349090576, "kl": 0.0001704170208540745, "learning_rate": 8.55e-07, "loss": 0.1531, "num_tokens": 90312.0, "reward": 4.125, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.308422088623047, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 5.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015186072560027242, "kl": 3.7874280678806826e-05, "learning_rate": 8.58e-07, "loss": 0.0, "num_tokens": 90572.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 5.333333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018420862033963203, "kl": 2.727508581301663e-05, "learning_rate": 8.61e-07, "loss": 0.0, "num_tokens": 90792.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 5.351851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.638594150543213, "kl": 0.0003076635766774416, "learning_rate": 8.64e-07, "loss": 0.0638, "num_tokens": 91084.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.183140277862549, "kl": 0.0020975497318431735, "learning_rate": 8.669999999999999e-07, "loss": 0.1627, "num_tokens": 91345.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 5.388888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.027248112484812737, "kl": 0.0008877874352037907, "learning_rate": 8.699999999999999e-07, "loss": 0.0, "num_tokens": 91672.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 5.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.007399424444884062, "kl": 0.0001283470082853455, "learning_rate": 8.729999999999999e-07, "loss": 0.0, "num_tokens": 91981.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 5.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.7383952140808105, "kl": 0.0005632737884297967, "learning_rate": 8.76e-07, "loss": 0.031, "num_tokens": 92291.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 5.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 3.111842393875122, "kl": 0.00033309114223811775, "learning_rate": 8.79e-07, "loss": 0.0634, "num_tokens": 92633.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 5.462962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.002238117391243577, "kl": 5.358642010833137e-05, "learning_rate": 8.82e-07, "loss": 0.0, "num_tokens": 92952.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 5.481481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.035788360983133316, "kl": 0.0009891540103126317, "learning_rate": 8.85e-07, "loss": 0.0, "num_tokens": 93224.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07432208955287933, "kl": 0.0010253414511680603, "learning_rate": 8.88e-07, "loss": 0.0001, "num_tokens": 93436.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.518518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 2.9599413871765137, "kl": 0.00011427780555095524, "learning_rate": 8.91e-07, "loss": 0.0376, "num_tokens": 93727.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 5.537037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.6163761615753174, "kl": 0.0018340190581511706, "learning_rate": 8.939999999999999e-07, "loss": 0.3495, "num_tokens": 94107.0, "reward": 3.375, "reward_std": 4.643543720245361, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 4.643543720245361, "step": 299 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.014705882407724857, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014705882407724857, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 5.555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 3.5918540954589844, "kl": 0.0007441753841703758, "learning_rate": 8.969999999999999e-07, "loss": 0.0033, "num_tokens": 94399.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 5.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.023262349888682365, "kl": 0.0008120479760691524, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 94706.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.592592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.007687571458518505, "kl": 0.00011148526391480118, "learning_rate": 9.03e-07, "loss": 0.0, "num_tokens": 94985.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 5.611111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013503205263987184, "kl": 7.820480050213519e-06, "learning_rate": 9.06e-07, "loss": 0.0, "num_tokens": 95293.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 5.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.17464542388916, "kl": 0.0005902486263948958, "learning_rate": 9.09e-07, "loss": 0.1436, "num_tokens": 95598.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 5.648148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009366406593471766, "kl": 2.6514132514421362e-05, "learning_rate": 9.12e-07, "loss": 0.0, "num_tokens": 95814.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 84.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 84.0, "completions/mean_terminated_length": 26.666667938232422, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 5.666666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.890966773033142, "kl": 0.00028350279171718284, "learning_rate": 9.15e-07, "loss": 0.477, "num_tokens": 96394.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.685185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01795792579650879, "kl": 0.0003880493895849213, "learning_rate": 9.18e-07, "loss": 0.0, "num_tokens": 96665.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.011904762126505375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011904762126505375, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 5.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.3754703998565674, "kl": 0.0007309003995032981, "learning_rate": 9.210000000000001e-07, "loss": -0.1105, "num_tokens": 97011.0, "reward": 0.25, "reward_std": 0.5, "rewards/reward_combined/mean": 0.25, "rewards/reward_combined/std": 0.5, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 5.722222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.015370920300483704, "kl": 0.0002644389860506635, "learning_rate": 9.24e-07, "loss": 0.0, "num_tokens": 97267.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 5.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.880171775817871, "kl": 0.0003286706341896206, "learning_rate": 9.27e-07, "loss": 0.3679, "num_tokens": 97610.0, "reward": 3.75, "reward_std": 2.723355770111084, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 2.723355770111084, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 5.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.7711498737335205, "kl": 0.0010276629727741238, "learning_rate": 9.3e-07, "loss": 0.012, "num_tokens": 97961.0, "reward": 1.875, "reward_std": 1.75, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 1.75, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 5.777777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.014316629618406296, "kl": 0.00019802508359134663, "learning_rate": 9.33e-07, "loss": 0.0, "num_tokens": 98195.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 5.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020444500260055065, "kl": 4.9935530114453286e-05, "learning_rate": 9.36e-07, "loss": 0.0, "num_tokens": 98465.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 9.964408673113212e-05, "kl": 1.3820827007293701e-06, "learning_rate": 9.39e-07, "loss": 0.0, "num_tokens": 98685.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 5.833333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.5785439014434814, "kl": 0.0023260287125594914, "learning_rate": 9.42e-07, "loss": 0.0542, "num_tokens": 99022.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 5.851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006944568012841046, "kl": 7.503728056690306e-06, "learning_rate": 9.450000000000001e-07, "loss": 0.0, "num_tokens": 99331.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 5.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003281471144873649, "kl": 2.4133672695825226e-06, "learning_rate": 9.480000000000001e-07, "loss": 0.0, "num_tokens": 99695.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 5.888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.9598276615142822, "kl": 0.0005881217948626727, "learning_rate": 9.510000000000001e-07, "loss": -0.0436, "num_tokens": 99998.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 5.907407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 8.086658477783203, "kl": 0.000580109772272408, "learning_rate": 9.54e-07, "loss": -0.0999, "num_tokens": 100287.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 5.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.3228225708007812, "kl": 0.0007797148209647276, "learning_rate": 9.570000000000001e-07, "loss": -0.0007, "num_tokens": 100715.0, "reward": 0.125, "reward_std": 0.25, "rewards/reward_combined/mean": 0.125, "rewards/reward_combined/std": 0.25, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.944444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.004528549499809742, "kl": 0.0001575574278831482, "learning_rate": 9.600000000000001e-07, "loss": 0.0, "num_tokens": 100951.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 5.962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004139020573347807, "kl": 2.9399991035461426e-05, "learning_rate": 9.630000000000001e-07, "loss": 0.0, "num_tokens": 101157.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 5.981481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 3.0748610496520996, "kl": 0.0004400626348797232, "learning_rate": 9.660000000000002e-07, "loss": -0.2451, "num_tokens": 101554.0, "reward": 4.125, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.190763473510742, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 6.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.219311714172363, "kl": 0.0006390140042640269, "learning_rate": 9.69e-07, "loss": 0.0043, "num_tokens": 101904.0, "reward": 1.75, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.4433757066726685, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 6.018518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.005185985006392002, "kl": 0.00015741996321594343, "learning_rate": 9.72e-07, "loss": 0.0, "num_tokens": 102162.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 6.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.7087786197662354, "kl": 0.0027644065266940743, "learning_rate": 9.75e-07, "loss": -0.1186, "num_tokens": 102496.0, "reward": 3.25, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 3.4034297466278076, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 6.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 4.0765700340271, "kl": 0.0005693767161574215, "learning_rate": 9.78e-07, "loss": -0.0133, "num_tokens": 102785.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006172839552164078, "clip_ratio/low_min": 0.006172839552164078, "clip_ratio/region_mean": 0.006172839552164078, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 6.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.328338623046875, "kl": 0.000910238508367911, "learning_rate": 9.81e-07, "loss": -0.0884, "num_tokens": 103218.0, "reward": 0.675000011920929, "reward_std": 1.0436315536499023, "rewards/reward_combined/mean": 0.675000011920929, "rewards/reward_combined/std": 1.0436315536499023, "step": 328 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.009615384973585606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 6.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 6.679901599884033, "kl": 0.0006505083292722702, "learning_rate": 9.84e-07, "loss": 0.1025, "num_tokens": 103533.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 6.111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 16.89786148071289, "kl": 0.006327513605356216, "learning_rate": 9.87e-07, "loss": 0.3069, "num_tokens": 103751.0, "reward": 1.625, "reward_std": 3.75, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 3.75, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 6.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004182321485131979, "kl": 0.00012972205877304077, "learning_rate": 9.9e-07, "loss": 0.0, "num_tokens": 103987.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 6.148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 13.095587730407715, "kl": 0.0001467828915338032, "learning_rate": 9.93e-07, "loss": 0.1542, "num_tokens": 104254.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 6.166666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005609396612271667, "kl": 4.268196789780632e-06, "learning_rate": 9.96e-07, "loss": 0.0, "num_tokens": 104561.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 6.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 7.383676052093506, "kl": 0.0007989190053194761, "learning_rate": 9.99e-07, "loss": 0.0026, "num_tokens": 104833.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 6.203703703703703, "frac_reward_zero_std": 1.0, "grad_norm": 0.02250610664486885, "kl": 0.00023718326701782644, "learning_rate": 1.002e-06, "loss": 0.0, "num_tokens": 105046.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 6.222222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.012680215761065483, "kl": 0.00019833587248285767, "learning_rate": 1.0050000000000001e-06, "loss": 0.0, "num_tokens": 105281.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.2407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.60512638092041, "kl": 0.0003479543811408803, "learning_rate": 1.0080000000000001e-06, "loss": -0.0261, "num_tokens": 105571.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.2592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.011472498998045921, "kl": 0.00012253136264916975, "learning_rate": 1.0110000000000001e-06, "loss": 0.0, "num_tokens": 105849.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 6.277777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.007044440601021051, "kl": 0.0002377020791755058, "learning_rate": 1.0140000000000002e-06, "loss": 0.0, "num_tokens": 106144.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 86.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 86.0, "completions/mean_terminated_length": 29.33333396911621, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 6.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.2721306085586548, "kl": 0.0005542633589357138, "learning_rate": 1.0170000000000002e-06, "loss": 0.4945, "num_tokens": 106704.0, "reward": 1.125, "reward_std": 1.8427786827087402, "rewards/reward_combined/mean": 1.125, "rewards/reward_combined/std": 1.8427786827087402, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 6.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.622669696807861, "kl": 0.00036851988988928497, "learning_rate": 1.0200000000000002e-06, "loss": -0.0036, "num_tokens": 107006.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 6.333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 16.67360496520996, "kl": 0.0004253744962170458, "learning_rate": 1.0230000000000002e-06, "loss": -0.2063, "num_tokens": 107223.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 6.351851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 6.969684600830078, "kl": 0.0002880445899791084, "learning_rate": 1.026e-06, "loss": 0.1829, "num_tokens": 107489.0, "reward": 2.625, "reward_std": 1.75, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 1.75, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.946965456008911, "kl": 0.0006369232141878456, "learning_rate": 1.029e-06, "loss": 0.1015, "num_tokens": 107810.0, "reward": 3.25, "reward_std": 3.0686588287353516, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 3.0686588287353516, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 75.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 6.388888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.1811447143554688, "kl": 0.000475488806841895, "learning_rate": 1.032e-06, "loss": 0.3989, "num_tokens": 108350.0, "reward": 0.30000001192092896, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 0.30000001192092896, "rewards/reward_combined/std": 0.4000000059604645, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 6.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 2.07658052444458, "kl": 0.000156188674736768, "learning_rate": 1.035e-06, "loss": 0.2415, "num_tokens": 108689.0, "reward": 3.0, "reward_std": 5.196152210235596, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 5.196152210235596, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 6.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.025724634528160095, "kl": 0.0005815433396492153, "learning_rate": 1.0379999999999998e-06, "loss": 0.0, "num_tokens": 109009.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 6.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009777398081496358, "kl": 4.061988875037059e-05, "learning_rate": 1.0409999999999999e-06, "loss": 0.0, "num_tokens": 109286.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 6.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.8238067626953125, "kl": 0.0007863019127398729, "learning_rate": 1.0439999999999999e-06, "loss": 0.1609, "num_tokens": 109641.0, "reward": 3.0, "reward_std": 3.188521146774292, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 3.188521146774292, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 6.481481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.00102147925645113, "kl": 2.3031341243040515e-05, "learning_rate": 1.0469999999999999e-06, "loss": 0.0, "num_tokens": 109884.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 6.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.23366641998291, "kl": 0.00045225843496154994, "learning_rate": 1.05e-06, "loss": 0.1784, "num_tokens": 110246.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 6.518518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 6.293183832895011e-05, "kl": 1.762683154993283e-06, "learning_rate": 1.053e-06, "loss": 0.0, "num_tokens": 110610.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 6.537037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.090112686157227, "kl": 0.0006403782754205167, "learning_rate": 1.056e-06, "loss": -0.1273, "num_tokens": 110944.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 6.555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 4.235172748565674, "kl": 0.0005330756175681017, "learning_rate": 1.059e-06, "loss": 0.0892, "num_tokens": 111214.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 6.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.1353631019592285, "kl": 0.000659962504869327, "learning_rate": 1.062e-06, "loss": -0.0512, "num_tokens": 111480.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 3.5032594203948975, "kl": 0.001133062643930316, "learning_rate": 1.065e-06, "loss": -0.3193, "num_tokens": 111827.0, "reward": 1.375, "reward_std": 2.3228933811187744, "rewards/reward_combined/mean": 1.375, "rewards/reward_combined/std": 2.3228933811187744, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 6.611111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.8815441131591797, "kl": 0.00030877380777383223, "learning_rate": 1.068e-06, "loss": -0.0348, "num_tokens": 112131.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 6.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 2.957538890768774e-05, "kl": 4.023313522338867e-07, "learning_rate": 1.071e-06, "loss": 0.0, "num_tokens": 112351.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.648148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.019362876191735268, "kl": 0.0006637527840211987, "learning_rate": 1.074e-06, "loss": 0.0, "num_tokens": 112654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 6.666666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 4.3079376220703125, "kl": 0.004450089356396347, "learning_rate": 1.077e-06, "loss": 0.0795, "num_tokens": 112946.0, "reward": 7.125, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 7.125, "rewards/reward_combined/std": 1.4361406564712524, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.8428308963775635, "kl": 6.044354233836202e-05, "learning_rate": 1.08e-06, "loss": 0.0624, "num_tokens": 113234.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 6.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.336367130279541, "kl": 0.00424649418619083, "learning_rate": 1.083e-06, "loss": 0.0272, "num_tokens": 113541.0, "reward": 3.75, "reward_std": 5.057997226715088, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 5.057997226715088, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 6.722222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.02751585654914379, "kl": 0.0002781063230941072, "learning_rate": 1.086e-06, "loss": 0.0, "num_tokens": 113797.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 6.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 5.1264190673828125, "kl": 0.00015835894737392664, "learning_rate": 1.089e-06, "loss": 0.0531, "num_tokens": 114090.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 93.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 93.75, "completions/mean_terminated_length": 39.66666793823242, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 6.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 1.091813325881958, "kl": 0.0004173710767645389, "learning_rate": 1.092e-06, "loss": 0.4636, "num_tokens": 114689.0, "reward": 0.2999999523162842, "reward_std": 5.062279224395752, "rewards/reward_combined/mean": 0.2999999523162842, "rewards/reward_combined/std": 5.062278747558594, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 6.777777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.011059912852942944, "kl": 0.00016621185568510555, "learning_rate": 1.095e-06, "loss": 0.0, "num_tokens": 115013.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 6.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.006694111507385969, "kl": 0.00016759575373725966, "learning_rate": 1.098e-06, "loss": 0.0, "num_tokens": 115342.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 6.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008660645224153996, "kl": 0.0002451826585456729, "learning_rate": 1.101e-06, "loss": 0.0, "num_tokens": 115624.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 6.833333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 8.97185230255127, "kl": 0.0005101642454974353, "learning_rate": 1.104e-06, "loss": 0.3231, "num_tokens": 115859.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 369 }, { "clip_ratio/high_max": 0.006849315017461777, "clip_ratio/high_mean": 0.006849315017461777, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006849315017461777, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 6.851851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.35051965713501, "kl": 0.0006835753229097463, "learning_rate": 1.107e-06, "loss": 0.1762, "num_tokens": 116219.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 6.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.666428327560425, "kl": 0.0007801286119502038, "learning_rate": 1.11e-06, "loss": 0.0288, "num_tokens": 116480.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 6.888888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01839413307607174, "kl": 0.0003345608856761828, "learning_rate": 1.113e-06, "loss": 0.0, "num_tokens": 116740.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 6.907407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 7.535190105438232, "kl": 0.0002525041636545211, "learning_rate": 1.116e-06, "loss": 0.0665, "num_tokens": 117057.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 6.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.119668483734131, "kl": 0.0008038908708840609, "learning_rate": 1.119e-06, "loss": 0.0319, "num_tokens": 117350.0, "reward": 3.5, "reward_std": 5.446711540222168, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 5.446711540222168, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 6.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 2.4843716621398926, "kl": 0.00026746795265353285, "learning_rate": 1.122e-06, "loss": -0.0762, "num_tokens": 117673.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.003542846767231822, "kl": 6.279822991928086e-05, "learning_rate": 1.125e-06, "loss": 0.0, "num_tokens": 117941.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 6.981481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071050627157092094, "kl": 0.0002686096850084141, "learning_rate": 1.128e-06, "loss": 0.0, "num_tokens": 118209.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 7.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0074279578402638435, "kl": 7.043033838272095e-05, "learning_rate": 1.131e-06, "loss": 0.0, "num_tokens": 118415.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 7.018518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.02381051518023014, "kl": 0.00021494428074220195, "learning_rate": 1.134e-06, "loss": 0.0, "num_tokens": 118631.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 7.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.0429561138153076, "kl": 0.0007445274095516652, "learning_rate": 1.137e-06, "loss": 0.1676, "num_tokens": 119008.0, "reward": 2.625, "reward_std": 3.6371922492980957, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 3.6371922492980957, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001879699295386672, "clip_ratio/low_min": 0.001879699295386672, "clip_ratio/region_mean": 0.001879699295386672, "completion_length": 71.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 71.5, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 1.623712420463562, "kl": 0.0001712968969513895, "learning_rate": 1.14e-06, "loss": 0.4876, "num_tokens": 119514.0, "reward": 1.0499999523162842, "reward_std": 3.2264533042907715, "rewards/reward_combined/mean": 1.0499999523162842, "rewards/reward_combined/std": 3.2264530658721924, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 7.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.947168827056885, "kl": 0.0006473724788520485, "learning_rate": 1.1430000000000001e-06, "loss": 0.2235, "num_tokens": 119819.0, "reward": 3.125, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.4361406564712524, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008771929889917374, "clip_ratio/low_min": 0.008771929889917374, "clip_ratio/region_mean": 0.008771929889917374, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 7.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 4.786294937133789, "kl": 0.0009779173415154219, "learning_rate": 1.1460000000000001e-06, "loss": 0.2366, "num_tokens": 120158.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.111111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02692675217986107, "kl": 0.00034688253072090447, "learning_rate": 1.1490000000000001e-06, "loss": 0.0, "num_tokens": 120434.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 7.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.033407099545001984, "kl": 0.0017122626304626465, "learning_rate": 1.1520000000000002e-06, "loss": 0.0001, "num_tokens": 120670.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.3273794651031494, "kl": 0.0006077333237044513, "learning_rate": 1.155e-06, "loss": -0.0781, "num_tokens": 120963.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.166666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 4.8814287185668945, "kl": 0.0002583457971923053, "learning_rate": 1.158e-06, "loss": 0.0313, "num_tokens": 121230.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 7.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.018314838409424, "kl": 0.0004732194356620312, "learning_rate": 1.161e-06, "loss": 0.0882, "num_tokens": 121567.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 7.203703703703703, "frac_reward_zero_std": 1.0, "grad_norm": 0.004768583457916975, "kl": 7.208179499684775e-05, "learning_rate": 1.164e-06, "loss": 0.0, "num_tokens": 121874.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 7.222222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.00903977919369936, "kl": 0.00011473968697828241, "learning_rate": 1.167e-06, "loss": 0.0, "num_tokens": 122192.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 7.2407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.260708808898926, "kl": 0.0007341466553043574, "learning_rate": 1.17e-06, "loss": 0.0367, "num_tokens": 122518.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 7.2592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.04728669673204422, "kl": 0.002906686277128756, "learning_rate": 1.173e-06, "loss": 0.0002, "num_tokens": 122802.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.277777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.009743906557559967, "kl": 0.00012787146988557652, "learning_rate": 1.176e-06, "loss": 0.0, "num_tokens": 123059.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 7.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.877381443977356, "kl": 0.0004237563698552549, "learning_rate": 1.179e-06, "loss": -0.0545, "num_tokens": 123477.0, "reward": 0.875, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 0.875, "rewards/reward_combined/std": 1.4361406564712524, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 87.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 87.0, "completions/mean_terminated_length": 87.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 7.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.5418593883514404, "kl": 0.00028770643984898925, "learning_rate": 1.182e-06, "loss": 0.346, "num_tokens": 124049.0, "reward": 5.175000190734863, "reward_std": 4.650000095367432, "rewards/reward_combined/mean": 5.175000190734863, "rewards/reward_combined/std": 4.650000095367432, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 7.333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.6314048767089844, "kl": 0.009780138731002808, "learning_rate": 1.185e-06, "loss": 0.0369, "num_tokens": 124348.0, "reward": 3.875, "reward_std": 4.75, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 4.75, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 7.351851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.008155149407684803, "kl": 0.0002987432962981984, "learning_rate": 1.188e-06, "loss": 0.0, "num_tokens": 124673.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 7.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 6.014007522026077e-05, "kl": 7.972121238708496e-07, "learning_rate": 1.1910000000000001e-06, "loss": 0.0, "num_tokens": 124893.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 7.388888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.616152286529541, "kl": 0.0004403176426421851, "learning_rate": 1.1940000000000001e-06, "loss": 0.1236, "num_tokens": 125239.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.042800359427928925, "kl": 0.0007947605045046657, "learning_rate": 1.1970000000000001e-06, "loss": 0.0, "num_tokens": 125515.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 7.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0063782949000597, "kl": 0.00010485592065379024, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "num_tokens": 125833.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 7.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 5.440243721008301, "kl": 0.0008377085032407194, "learning_rate": 1.2030000000000002e-06, "loss": -0.0047, "num_tokens": 126140.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 7.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.803163528442383, "kl": 0.00040737282688496634, "learning_rate": 1.2060000000000002e-06, "loss": 0.0286, "num_tokens": 126448.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 403 }, { "clip_ratio/high_max": 0.010204081423580647, "clip_ratio/high_mean": 0.010204081423580647, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010204081423580647, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 7.481481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.568783760070801, "kl": 0.0010241676936857402, "learning_rate": 1.2090000000000002e-06, "loss": 0.125, "num_tokens": 126782.0, "reward": 1.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 7.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.622627258300781, "kl": 0.0013473780127242208, "learning_rate": 1.2120000000000002e-06, "loss": -0.0242, "num_tokens": 127086.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 7.518518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 2.908332586288452, "kl": 0.00019460863404674456, "learning_rate": 1.215e-06, "loss": 0.0158, "num_tokens": 127379.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.537037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.318586587905884, "kl": 0.000286047605186468, "learning_rate": 1.218e-06, "loss": 0.0681, "num_tokens": 127657.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 7.555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 10.478938102722168, "kl": 0.002324142144061625, "learning_rate": 1.221e-06, "loss": 0.0181, "num_tokens": 127875.0, "reward": 3.125, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.4361406564712524, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.6333165168762207, "kl": 0.0003788674366660416, "learning_rate": 1.224e-06, "loss": 0.0038, "num_tokens": 128157.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.592592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.005975629203021526, "kl": 4.957751730216842e-05, "learning_rate": 1.2269999999999999e-06, "loss": 0.0, "num_tokens": 128453.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 7.611111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004418897908180952, "kl": 5.195409289626696e-05, "learning_rate": 1.2299999999999999e-06, "loss": 0.0, "num_tokens": 128819.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 74.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 74.75, "completions/mean_terminated_length": 14.333333969116211, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 7.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.937324047088623, "kl": 0.00043015779374400154, "learning_rate": 1.2329999999999999e-06, "loss": 0.4479, "num_tokens": 129338.0, "reward": 3.25, "reward_std": 4.907477378845215, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 4.907477378845215, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 7.648148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.567517042160034, "kl": 0.014520742231979966, "learning_rate": 1.236e-06, "loss": 0.1298, "num_tokens": 129635.0, "reward": 5.25, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 3.4034297466278076, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 7.666666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.019435608759522438, "kl": 0.0003772839190787636, "learning_rate": 1.239e-06, "loss": 0.0, "num_tokens": 129869.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 7.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.85819149017334, "kl": 0.004227758385241032, "learning_rate": 1.242e-06, "loss": 0.0502, "num_tokens": 130156.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 7.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.305883884429932, "kl": 0.0005044145509600639, "learning_rate": 1.245e-06, "loss": 0.0039, "num_tokens": 130446.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.722222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.05890405923128128, "kl": 0.001081001479178667, "learning_rate": 1.248e-06, "loss": 0.0001, "num_tokens": 130708.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.6684036254882812, "kl": 7.339137391682016e-05, "learning_rate": 1.251e-06, "loss": 0.0325, "num_tokens": 130980.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 7.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 5.540816783905029, "kl": 0.00023993071226868778, "learning_rate": 1.254e-06, "loss": 0.0785, "num_tokens": 131243.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 7.777777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.025105897337198257, "kl": 0.0004859610344283283, "learning_rate": 1.257e-06, "loss": 0.0, "num_tokens": 131566.0, "reward": 0.5, "reward_std": 0.0, "rewards/reward_combined/mean": 0.5, "rewards/reward_combined/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 7.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00729712937027216, "kl": 0.00019451748084975407, "learning_rate": 1.26e-06, "loss": 0.0, "num_tokens": 131880.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 7.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10664273798465729, "kl": 0.001314006745815277, "learning_rate": 1.263e-06, "loss": 0.0001, "num_tokens": 132092.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 7.833333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.205585479736328, "kl": 0.0014551758067682385, "learning_rate": 1.266e-06, "loss": 0.2398, "num_tokens": 132467.0, "reward": 2.375, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 3.4247870445251465, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 7.851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.014647755771875381, "kl": 0.00028278891841182485, "learning_rate": 1.269e-06, "loss": 0.0, "num_tokens": 132735.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 7.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04798325151205063, "kl": 0.0003249421715736389, "learning_rate": 1.272e-06, "loss": 0.0, "num_tokens": 132941.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 7.888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.873548984527588, "kl": 0.0014190449146553874, "learning_rate": 1.275e-06, "loss": 0.0396, "num_tokens": 133245.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 7.907407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 4.278851509094238, "kl": 0.007844563573598862, "learning_rate": 1.278e-06, "loss": 0.0466, "num_tokens": 133507.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 7.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.046332165598869324, "kl": 0.0006397667457349598, "learning_rate": 1.281e-06, "loss": 0.0, "num_tokens": 133756.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 7.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 3.288076400756836, "kl": 0.0013455498265102506, "learning_rate": 1.284e-06, "loss": 0.0601, "num_tokens": 134089.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.006751934066414833, "kl": 0.00013109322026139125, "learning_rate": 1.287e-06, "loss": 0.0, "num_tokens": 134351.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 7.981481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 7.11538553237915, "kl": 0.0004287466290406883, "learning_rate": 1.29e-06, "loss": 0.3603, "num_tokens": 134588.0, "reward": 2.625, "reward_std": 2.75, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 2.75, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.785628795623779, "kl": 0.0006086694047553465, "learning_rate": 1.293e-06, "loss": 0.1831, "num_tokens": 134869.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.018518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.176998615264893, "kl": 0.001978690270334482, "learning_rate": 1.296e-06, "loss": -0.0606, "num_tokens": 135158.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 8.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.013280250132083893, "kl": 0.0003153277466481086, "learning_rate": 1.299e-06, "loss": 0.0, "num_tokens": 135465.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 75.25, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 4.434341907501221, "kl": 0.0005841281672473997, "learning_rate": 1.302e-06, "loss": 0.3676, "num_tokens": 135982.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 8.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 6.653457164764404, "kl": 0.0009990200342144817, "learning_rate": 1.305e-06, "loss": -0.068, "num_tokens": 136246.0, "reward": 2.0, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 1.7320507764816284, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 7.002219200134277, "kl": 0.0002858766383724287, "learning_rate": 1.308e-06, "loss": 0.1682, "num_tokens": 136530.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 85.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 85.25, "completions/mean_terminated_length": 28.33333396911621, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 8.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.5005788803100586, "kl": 0.001248143264092505, "learning_rate": 1.311e-06, "loss": 0.2577, "num_tokens": 137123.0, "reward": 2.924999952316284, "reward_std": 3.3944807052612305, "rewards/reward_combined/mean": 2.924999952316284, "rewards/reward_combined/std": 3.3944807052612305, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 62.5, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 62.5, "completions/mean_terminated_length": 62.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 8.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.4210071563720703, "kl": 0.001322774973232299, "learning_rate": 1.314e-06, "loss": 0.0425, "num_tokens": 137601.0, "reward": 4.550000190734863, "reward_std": 3.07300066947937, "rewards/reward_combined/mean": 4.550000190734863, "rewards/reward_combined/std": 3.07300066947937, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 8.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 6.893808364868164, "kl": 0.001215376891195774, "learning_rate": 1.317e-06, "loss": -0.0023, "num_tokens": 137905.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 8.166666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.039437662810087204, "kl": 0.0012664295791182667, "learning_rate": 1.32e-06, "loss": 0.0001, "num_tokens": 138235.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 8.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.2584749460220337, "kl": 7.745823495497461e-06, "learning_rate": 1.323e-06, "loss": 0.0013, "num_tokens": 138598.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02892359346151352, "kl": 0.0005071528576081619, "learning_rate": 1.326e-06, "loss": 0.0, "num_tokens": 138864.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 8.222222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 2.531430721282959, "kl": 0.00043822579027619213, "learning_rate": 1.3290000000000001e-06, "loss": -0.0243, "num_tokens": 139166.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.594545841217041, "kl": 0.00045026788575341925, "learning_rate": 1.3320000000000001e-06, "loss": 0.0932, "num_tokens": 139439.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.043402764946222305, "kl": 0.0017976841118070297, "learning_rate": 1.3350000000000001e-06, "loss": 0.0001, "num_tokens": 139729.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 8.277777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 2.8853728771209717, "kl": 0.009266020730137825, "learning_rate": 1.3380000000000001e-06, "loss": 0.0311, "num_tokens": 140021.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 8.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.021646499633789, "kl": 0.01693438400980085, "learning_rate": 1.3410000000000002e-06, "loss": 0.0839, "num_tokens": 140283.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 8.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.017803365364670753, "kl": 0.0006827447505202144, "learning_rate": 1.344e-06, "loss": 0.0, "num_tokens": 140606.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 5.326924800872803, "kl": 0.0030778807904425776, "learning_rate": 1.347e-06, "loss": 0.0366, "num_tokens": 140880.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 8.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.021769413724541664, "kl": 0.0008617108105681837, "learning_rate": 1.35e-06, "loss": 0.0001, "num_tokens": 141227.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 8.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.3093459606170654, "kl": 0.00042448812746442854, "learning_rate": 1.353e-06, "loss": 0.1012, "num_tokens": 141491.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03047255426645279, "kl": 0.0015327175497077405, "learning_rate": 1.356e-06, "loss": 0.0001, "num_tokens": 141772.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 3.3602254390716553, "kl": 0.00742534501478076, "learning_rate": 1.359e-06, "loss": 0.1568, "num_tokens": 142068.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 8.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.011161944828927517, "kl": 0.00017811357975006104, "learning_rate": 1.362e-06, "loss": 0.0, "num_tokens": 142280.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 8.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 6.692661762237549, "kl": 0.0005987969925627112, "learning_rate": 1.365e-06, "loss": -0.0898, "num_tokens": 142544.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 8.462962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.03104279935359955, "kl": 0.00034093111753463745, "learning_rate": 1.368e-06, "loss": 0.0, "num_tokens": 142764.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 8.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05662114918231964, "kl": 0.001474709075409919, "learning_rate": 1.371e-06, "loss": 0.0001, "num_tokens": 142996.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 8.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.059370517730713, "kl": 0.0018500362057238817, "learning_rate": 1.374e-06, "loss": 0.1937, "num_tokens": 143269.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 8.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.037880558520555496, "kl": 0.000834152102470398, "learning_rate": 1.3770000000000001e-06, "loss": 0.0, "num_tokens": 143477.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 8.537037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 2.1003763675689697, "kl": 0.0006077908328734338, "learning_rate": 1.3800000000000001e-06, "loss": 0.0427, "num_tokens": 143884.0, "reward": 0.125, "reward_std": 0.25, "rewards/reward_combined/mean": 0.125, "rewards/reward_combined/std": 0.25, "step": 461 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 8.555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 3.5658528804779053, "kl": 0.030395436100661755, "learning_rate": 1.3830000000000001e-06, "loss": 0.0682, "num_tokens": 144186.0, "reward": 3.5, "reward_std": 3.488075017929077, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 3.488075017929077, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 8.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.026647405698895454, "kl": 0.00041704023715283256, "learning_rate": 1.3860000000000002e-06, "loss": 0.0, "num_tokens": 144420.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 8.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.35224986076355, "kl": 0.0005700888286810368, "learning_rate": 1.3890000000000002e-06, "loss": 0.2826, "num_tokens": 144802.0, "reward": 1.75, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.4433757066726685, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 8.050783157348633, "kl": 0.0005874587732250802, "learning_rate": 1.3920000000000002e-06, "loss": 0.1148, "num_tokens": 145105.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 8.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.94515323638916, "kl": 0.0006666499830316752, "learning_rate": 1.3950000000000002e-06, "loss": -0.1806, "num_tokens": 145466.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 8.648148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016897855093702674, "kl": 4.260614514350891e-05, "learning_rate": 1.3980000000000002e-06, "loss": 0.0, "num_tokens": 145710.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 8.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.4314656257629395, "kl": 0.0016374941042158753, "learning_rate": 1.401e-06, "loss": 0.1133, "num_tokens": 146066.0, "reward": 0.625, "reward_std": 0.25, "rewards/reward_combined/mean": 0.625, "rewards/reward_combined/std": 0.25, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 8.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.6874823570251465, "kl": 0.0025886285584419966, "learning_rate": 1.404e-06, "loss": 0.014, "num_tokens": 146380.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 8.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.0305562019348145, "kl": 0.0030521515873260796, "learning_rate": 1.407e-06, "loss": 0.1832, "num_tokens": 146725.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 8.722222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 2.66194224357605, "kl": 0.0007765647023916245, "learning_rate": 1.41e-06, "loss": 0.0002, "num_tokens": 147092.0, "reward": 1.5, "reward_std": 1.3540064096450806, "rewards/reward_combined/mean": 1.5, "rewards/reward_combined/std": 1.3540064096450806, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.3744161128997803, "kl": 0.0008247126825153828, "learning_rate": 1.4129999999999999e-06, "loss": 0.1106, "num_tokens": 147377.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 8.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.0727128982543945, "kl": 0.001286379061639309, "learning_rate": 1.4159999999999999e-06, "loss": 0.0786, "num_tokens": 147719.0, "reward": 1.125, "reward_std": 1.8427786827087402, "rewards/reward_combined/mean": 1.125, "rewards/reward_combined/std": 1.8427786827087402, "step": 473 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.02500000037252903, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02500000037252903, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 8.777777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 4.285005569458008, "kl": 0.0018552718393038958, "learning_rate": 1.4189999999999999e-06, "loss": 0.0381, "num_tokens": 147986.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 8.796296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 6.263662815093994, "kl": 0.001372149446979165, "learning_rate": 1.422e-06, "loss": 0.0716, "num_tokens": 148291.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 8.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03566886484622955, "kl": 0.0007755518017802387, "learning_rate": 1.425e-06, "loss": 0.0, "num_tokens": 148509.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 8.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018361331894993782, "kl": 0.00014239762822398916, "learning_rate": 1.428e-06, "loss": 0.0, "num_tokens": 148789.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 8.851851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.03665432706475258, "kl": 0.0007165893621277064, "learning_rate": 1.431e-06, "loss": 0.0, "num_tokens": 149045.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 80.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 80.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 8.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.69806706905365, "kl": 0.0013292425137478858, "learning_rate": 1.434e-06, "loss": 0.4274, "num_tokens": 149619.0, "reward": 5.25, "reward_std": 5.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 5.5, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 8.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.148035526275635, "kl": 0.01693603489547968, "learning_rate": 1.437e-06, "loss": 0.0869, "num_tokens": 149884.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 8.907407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.436980724334717, "kl": 0.002011268137721345, "learning_rate": 1.44e-06, "loss": -0.046, "num_tokens": 150194.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 8.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.014376312494277954, "kl": 0.0006001824513077736, "learning_rate": 1.443e-06, "loss": 0.0, "num_tokens": 150506.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 8.944444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.008478103205561638, "kl": 0.000158771472342778, "learning_rate": 1.446e-06, "loss": 0.0, "num_tokens": 150831.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 8.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075084068812429905, "kl": 0.0004975050687789917, "learning_rate": 1.449e-06, "loss": 0.0, "num_tokens": 151067.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 8.981481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.021673977375030518, "kl": 0.0014037236687727273, "learning_rate": 1.452e-06, "loss": 0.0001, "num_tokens": 151362.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 9.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06421205401420593, "kl": 0.0022028908133506775, "learning_rate": 1.455e-06, "loss": 0.0001, "num_tokens": 151576.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 9.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.010776751674711704, "kl": 0.00019974775204900652, "learning_rate": 1.458e-06, "loss": 0.0, "num_tokens": 151897.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 9.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.005626564845442772, "kl": 0.00015673040979891084, "learning_rate": 1.461e-06, "loss": 0.0, "num_tokens": 152193.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 9.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0110141821205616, "kl": 0.0002580478831077926, "learning_rate": 1.464e-06, "loss": 0.0, "num_tokens": 152449.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 72.5, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 72.5, "completions/mean_terminated_length": 72.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 9.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.9944734573364258, "kl": 0.0009289936569985002, "learning_rate": 1.467e-06, "loss": 0.4543, "num_tokens": 152963.0, "reward": 2.674999952316284, "reward_std": 5.974040985107422, "rewards/reward_combined/mean": 2.674999952316284, "rewards/reward_combined/std": 5.974040985107422, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 9.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.14732055366039276, "kl": 0.004145276732742786, "learning_rate": 1.4700000000000001e-06, "loss": 0.0002, "num_tokens": 153237.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 9.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05999904125928879, "kl": 0.0030060313874855638, "learning_rate": 1.473e-06, "loss": 0.0002, "num_tokens": 153507.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 9.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005623538745567203, "kl": 1.4913433005858678e-05, "learning_rate": 1.476e-06, "loss": 0.0, "num_tokens": 153815.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 9.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 7.0974884033203125, "kl": 0.001276601484278217, "learning_rate": 1.479e-06, "loss": 0.3508, "num_tokens": 154077.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006172839552164078, "clip_ratio/low_min": 0.006172839552164078, "clip_ratio/region_mean": 0.006172839552164078, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 9.166666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.2115442752838135, "kl": 0.02928131737280637, "learning_rate": 1.482e-06, "loss": -0.0329, "num_tokens": 154504.0, "reward": -0.8250000476837158, "reward_std": 2.3286263942718506, "rewards/reward_combined/mean": -0.8250000476837158, "rewards/reward_combined/std": 2.3286263942718506, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.25, "completions/mean_terminated_length": 2.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 9.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 13.164260864257812, "kl": 0.017533445730805397, "learning_rate": 1.485e-06, "loss": 0.0917, "num_tokens": 154717.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 100.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 100.75, "completions/mean_terminated_length": 49.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 9.203703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.6346595287323, "kl": 0.001183508342364803, "learning_rate": 1.488e-06, "loss": -0.1579, "num_tokens": 155344.0, "reward": 1.5499999523162842, "reward_std": 4.050926208496094, "rewards/reward_combined/mean": 1.5499999523162842, "rewards/reward_combined/std": 4.0509257316589355, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 9.222222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016270643391180784, "kl": 3.6284327507019043e-06, "learning_rate": 1.491e-06, "loss": 0.0, "num_tokens": 155564.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 9.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.976415634155273, "kl": 0.001442478969693184, "learning_rate": 1.494e-06, "loss": 0.1614, "num_tokens": 155884.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 9.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01832779496908188, "kl": 0.0006520260430988856, "learning_rate": 1.497e-06, "loss": 0.0, "num_tokens": 156198.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 9.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.001106397365219891, "kl": 1.572271139593795e-05, "learning_rate": 1.5e-06, "loss": 0.0, "num_tokens": 156468.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.436293363571167, "kl": 0.001056065782904625, "learning_rate": 1.503e-06, "loss": 0.033, "num_tokens": 156763.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 9.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.309901237487793, "kl": 0.0021982649923302233, "learning_rate": 1.506e-06, "loss": 0.1451, "num_tokens": 157100.0, "reward": 2.5, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.674234628677368, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 9.333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.007930577732622623, "kl": 0.0005407258868217468, "learning_rate": 1.509e-06, "loss": 0.0, "num_tokens": 157336.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 9.351851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 1.4427762031555176, "kl": 0.00042985317577404203, "learning_rate": 1.512e-06, "loss": 0.0002, "num_tokens": 157700.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 9.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.011734436266124249, "kl": 0.0004800920287379995, "learning_rate": 1.5150000000000001e-06, "loss": 0.0, "num_tokens": 158032.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07170011103153229, "kl": 0.005940357630606741, "learning_rate": 1.5180000000000001e-06, "loss": 0.0003, "num_tokens": 158315.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 9.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 4.212244987487793, "kl": 0.005429249256849289, "learning_rate": 1.5210000000000001e-06, "loss": 0.0288, "num_tokens": 158619.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 9.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.009870851412415504, "kl": 0.0002587953640613705, "learning_rate": 1.5240000000000001e-06, "loss": 0.0, "num_tokens": 158949.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 9.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 8.560111999511719, "kl": 0.002522015245631337, "learning_rate": 1.5270000000000002e-06, "loss": -0.0123, "num_tokens": 159242.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 9.462962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 5.170704364776611, "kl": 0.002358070865739137, "learning_rate": 1.53e-06, "loss": -0.0019, "num_tokens": 159547.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.015831032767891884, "kl": 0.0006280697125475854, "learning_rate": 1.533e-06, "loss": 0.0, "num_tokens": 159830.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 9.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.864472389221191, "kl": 0.003002442157594487, "learning_rate": 1.536e-06, "loss": 0.1529, "num_tokens": 160170.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 9.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05545349791646004, "kl": 0.0025941634085029364, "learning_rate": 1.539e-06, "loss": 0.0001, "num_tokens": 160462.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 9.537037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.017333369702100754, "kl": 0.0011692616099026054, "learning_rate": 1.542e-06, "loss": 0.0001, "num_tokens": 160832.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 9.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.05681229010224342, "kl": 0.0010110288858413696, "learning_rate": 1.545e-06, "loss": 0.0001, "num_tokens": 161092.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 82.75, "completions/mean_terminated_length": 25.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 9.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.5500227212905884, "kl": 0.00148715078830719, "learning_rate": 1.548e-06, "loss": 0.4173, "num_tokens": 161651.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 9.592592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019148801220580935, "kl": 0.00011230686141061597, "learning_rate": 1.551e-06, "loss": 0.0, "num_tokens": 161931.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.5351691246032715, "kl": 0.6368295695865527, "learning_rate": 1.554e-06, "loss": 0.0305, "num_tokens": 162229.0, "reward": 7.0, "reward_std": 2.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 2.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 9.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.008595379069447517, "kl": 0.00012179091572761536, "learning_rate": 1.557e-06, "loss": 0.0, "num_tokens": 162441.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 9.648148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.016889087855815887, "kl": 0.0012415878591127694, "learning_rate": 1.56e-06, "loss": 0.0001, "num_tokens": 162723.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 9.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 6.879021167755127, "kl": 0.0016143560060299933, "learning_rate": 1.5630000000000001e-06, "loss": -0.0411, "num_tokens": 162991.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.694362163543701, "kl": 0.0008695534197613597, "learning_rate": 1.5660000000000001e-06, "loss": 0.1415, "num_tokens": 163282.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 9.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 6.453367710113525, "kl": 0.002435041998978704, "learning_rate": 1.5690000000000001e-06, "loss": 0.4561, "num_tokens": 163554.0, "reward": 2.375, "reward_std": 1.8874585628509521, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.8874585628509521, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 9.722222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.02079276740550995, "kl": 0.001192980445921421, "learning_rate": 1.5720000000000002e-06, "loss": 0.0001, "num_tokens": 163882.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 9.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.8610076904296875, "kl": 0.0023750447726342827, "learning_rate": 1.5750000000000002e-06, "loss": 0.1223, "num_tokens": 164154.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 9.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.8834428787231445, "kl": 0.0032023880630731583, "learning_rate": 1.5780000000000002e-06, "loss": 0.0633, "num_tokens": 164499.0, "reward": 2.5, "reward_std": 1.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 9.777777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 3.78782320022583, "kl": 0.0025741655263118446, "learning_rate": 1.5810000000000002e-06, "loss": 0.0926, "num_tokens": 164869.0, "reward": 3.049999952316284, "reward_std": 0.33166250586509705, "rewards/reward_combined/mean": 3.049999952316284, "rewards/reward_combined/std": 0.33166247606277466, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 9.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.27029284834861755, "kl": 0.009208195144310594, "learning_rate": 1.5840000000000002e-06, "loss": 0.0005, "num_tokens": 165127.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 9.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.014770339243113995, "kl": 0.00014358200132846832, "learning_rate": 1.5870000000000002e-06, "loss": 0.0, "num_tokens": 165371.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 9.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0294018667191267, "kl": 0.0008226931531680748, "learning_rate": 1.59e-06, "loss": 0.0, "num_tokens": 165631.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 9.851851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.01135218609124422, "kl": 0.0005414411425590515, "learning_rate": 1.593e-06, "loss": 0.0, "num_tokens": 165841.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 57.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 9.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.112264156341553, "kl": 0.0026307841762900352, "learning_rate": 1.596e-06, "loss": 0.131, "num_tokens": 166285.0, "reward": 1.5499999523162842, "reward_std": 4.859012126922607, "rewards/reward_combined/mean": 1.5499999523162842, "rewards/reward_combined/std": 4.859012126922607, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.435405731201172, "kl": 0.021325815469026566, "learning_rate": 1.599e-06, "loss": -0.0024, "num_tokens": 166573.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 9.907407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.5232558250427246, "kl": 0.040265748859383166, "learning_rate": 1.602e-06, "loss": -0.152, "num_tokens": 166941.0, "reward": 2.25, "reward_std": 4.051748752593994, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 4.051748752593994, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 9.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.802764892578125, "kl": 0.010558367241173983, "learning_rate": 1.605e-06, "loss": 0.071, "num_tokens": 167204.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 9.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 4.850037097930908, "kl": 0.0031486726365983486, "learning_rate": 1.608e-06, "loss": 0.0171, "num_tokens": 167496.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 9.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.04560142755508423, "kl": 0.0008016876163310371, "learning_rate": 1.6110000000000001e-06, "loss": 0.0, "num_tokens": 167718.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 9.981481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 6.351009845733643, "kl": 0.005056664114817977, "learning_rate": 1.6140000000000001e-06, "loss": -0.0552, "num_tokens": 167984.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 10.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.612298965454102, "kl": 0.0034471265971660614, "learning_rate": 1.6170000000000001e-06, "loss": 0.1006, "num_tokens": 168326.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 10.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0329267680644989, "kl": 0.0011048528831452131, "learning_rate": 1.6200000000000002e-06, "loss": 0.0001, "num_tokens": 168630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 10.037037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 6.368251323699951, "kl": 0.00031063980713952333, "learning_rate": 1.6230000000000002e-06, "loss": 0.0412, "num_tokens": 168928.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 75.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 2.2446129322052, "kl": 0.002881466527469456, "learning_rate": 1.6260000000000002e-06, "loss": 0.4622, "num_tokens": 169451.0, "reward": 7.300000190734863, "reward_std": 0.40000009536743164, "rewards/reward_combined/mean": 7.300000190734863, "rewards/reward_combined/std": 0.40000009536743164, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 74.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 74.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 10.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.5643595457077026, "kl": 0.00021576111612375826, "learning_rate": 1.6290000000000002e-06, "loss": 0.4485, "num_tokens": 169973.0, "reward": 5.625, "reward_std": 4.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 4.75, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 10.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 7.54982328414917, "kl": 0.001731416559778154, "learning_rate": 1.6320000000000002e-06, "loss": 0.1921, "num_tokens": 170286.0, "reward": 0.25, "reward_std": 0.8660253882408142, "rewards/reward_combined/mean": 0.25, "rewards/reward_combined/std": 0.8660253882408142, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 10.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.2725299596786499, "kl": 0.012912555976072326, "learning_rate": 1.6350000000000002e-06, "loss": 0.0006, "num_tokens": 170544.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 10.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.035415973514318466, "kl": 0.0003276318311691284, "learning_rate": 1.6380000000000002e-06, "loss": 0.0, "num_tokens": 170754.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 10.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 3.413743257522583, "kl": 0.04540968965739012, "learning_rate": 1.6410000000000003e-06, "loss": 0.0366, "num_tokens": 171040.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 10.166666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03200657293200493, "kl": 0.001019798728520982, "learning_rate": 1.6440000000000003e-06, "loss": 0.0001, "num_tokens": 171307.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 10.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.4697763919830322, "kl": 0.0023434842005372047, "learning_rate": 1.647e-06, "loss": 0.1577, "num_tokens": 171669.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 10.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.13165704905986786, "kl": 0.019468783400952816, "learning_rate": 1.65e-06, "loss": 0.001, "num_tokens": 171963.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 10.222222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.029760370030999184, "kl": 0.0029473998583853245, "learning_rate": 1.653e-06, "loss": 0.0002, "num_tokens": 172323.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 14.334692001342773, "kl": 0.0014246180653572083, "learning_rate": 1.6560000000000001e-06, "loss": -0.1715, "num_tokens": 172563.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 10.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.9498836994171143, "kl": 0.0051345787942409515, "learning_rate": 1.6590000000000001e-06, "loss": -0.0792, "num_tokens": 172850.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 10.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.02657780796289444, "kl": 0.0018518269062042236, "learning_rate": 1.6620000000000001e-06, "loss": 0.0001, "num_tokens": 173062.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.734173059463501, "kl": 0.021748999832198024, "learning_rate": 1.6650000000000002e-06, "loss": 0.108, "num_tokens": 173417.0, "reward": 5.0, "reward_std": 3.5590262413024902, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.5590262413024902, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04306991398334503, "kl": 0.001214335861732252, "learning_rate": 1.6680000000000002e-06, "loss": 0.0001, "num_tokens": 173673.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 10.333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.013166028074920177, "kl": 0.0011980346753261983, "learning_rate": 1.6710000000000002e-06, "loss": 0.0001, "num_tokens": 173955.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 10.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017330129630863667, "kl": 7.345527410507202e-05, "learning_rate": 1.6740000000000002e-06, "loss": 0.0, "num_tokens": 174227.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.021276595070958138, "clip_ratio/high_mean": 0.021276595070958138, "clip_ratio/low_mean": 0.007692307699471712, "clip_ratio/low_min": 0.007692307699471712, "clip_ratio/region_mean": 0.02896890277042985, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 10.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.486509323120117, "kl": 0.0069011535961180925, "learning_rate": 1.6770000000000002e-06, "loss": 0.0458, "num_tokens": 174555.0, "reward": 2.75, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.190238118171692, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 67.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 67.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.8934012651443481, "kl": 0.0007226394955068827, "learning_rate": 1.6800000000000002e-06, "loss": 0.4866, "num_tokens": 175043.0, "reward": 2.549999952316284, "reward_std": 2.9000000953674316, "rewards/reward_combined/mean": 2.549999952316284, "rewards/reward_combined/std": 2.8999998569488525, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.011858646757900715, "kl": 0.0002608560025691986, "learning_rate": 1.6830000000000002e-06, "loss": 0.0, "num_tokens": 175255.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03355216234922409, "kl": 0.0005901605036342517, "learning_rate": 1.6860000000000002e-06, "loss": 0.0, "num_tokens": 175474.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.023276211693882942, "kl": 0.000884422188391909, "learning_rate": 1.6889999999999998e-06, "loss": 0.0, "num_tokens": 175747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.462962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 7.79395055770874, "kl": 0.0033861820120364428, "learning_rate": 1.6919999999999999e-06, "loss": 0.1825, "num_tokens": 176022.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 10.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05285676568746567, "kl": 0.0013320036232471466, "learning_rate": 1.6949999999999999e-06, "loss": 0.0001, "num_tokens": 176282.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 10.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.2973270416259766, "kl": 0.000938477780437097, "learning_rate": 1.6979999999999999e-06, "loss": -0.0155, "num_tokens": 176603.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 10.518518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.575317144393921, "kl": 0.0024951985105872154, "learning_rate": 1.7009999999999999e-06, "loss": 0.1103, "num_tokens": 176924.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 10.537037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 4.342179298400879, "kl": 0.0016367121716029942, "learning_rate": 1.704e-06, "loss": -0.0965, "num_tokens": 177218.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 10.555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 5.08101749420166, "kl": 0.0023249993100762367, "learning_rate": 1.707e-06, "loss": -0.2064, "num_tokens": 177566.0, "reward": 0.625, "reward_std": 0.25, "rewards/reward_combined/mean": 0.625, "rewards/reward_combined/std": 0.25, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 10.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.020238690078258514, "kl": 0.001071081671398133, "learning_rate": 1.71e-06, "loss": 0.0001, "num_tokens": 177844.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 10.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.5214464664459229, "kl": 0.0064206772949546576, "learning_rate": 1.713e-06, "loss": 0.0, "num_tokens": 178208.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 10.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.121870279312134, "kl": 0.0037148026167415082, "learning_rate": 1.716e-06, "loss": 0.1631, "num_tokens": 178573.0, "reward": 3.0, "reward_std": 3.316624879837036, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 3.316624879837036, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 10.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.407678127288818, "kl": 0.005521014798432589, "learning_rate": 1.719e-06, "loss": 0.2429, "num_tokens": 178923.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 10.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 5.755073547363281, "kl": 0.0037983747897669673, "learning_rate": 1.722e-06, "loss": -0.0337, "num_tokens": 179242.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 10.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.1125705242156982, "kl": 0.0015287879505194724, "learning_rate": 1.725e-06, "loss": 0.0503, "num_tokens": 179574.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 10.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.040618419647217, "kl": 0.003923992975614965, "learning_rate": 1.728e-06, "loss": 0.0309, "num_tokens": 179854.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 10.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.314767837524414, "kl": 0.005306090926751494, "learning_rate": 1.7309999999999998e-06, "loss": 0.0052, "num_tokens": 180217.0, "reward": 4.625, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 4.308422088623047, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 10.722222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.026743585243821144, "kl": 0.0008531607345503289, "learning_rate": 1.7339999999999998e-06, "loss": 0.0, "num_tokens": 180525.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.009803921915590763, "clip_ratio/low_min": 0.009803921915590763, "clip_ratio/region_mean": 0.02116755861788988, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 10.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.088840007781982, "kl": 0.0027468246989883482, "learning_rate": 1.7369999999999998e-06, "loss": 0.1058, "num_tokens": 180868.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00021816635853610933, "kl": 5.990266799926758e-06, "learning_rate": 1.7399999999999999e-06, "loss": 0.0, "num_tokens": 181088.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 10.777777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 4.219961166381836, "kl": 0.011513480450958014, "learning_rate": 1.7429999999999999e-06, "loss": -0.2091, "num_tokens": 181430.0, "reward": 0.0, "reward_std": 1.0, "rewards/reward_combined/mean": 0.0, "rewards/reward_combined/std": 1.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 10.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05167926847934723, "kl": 0.0028135766624473035, "learning_rate": 1.7459999999999999e-06, "loss": 0.0001, "num_tokens": 181736.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 10.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07724424451589584, "kl": 0.0018440705025568604, "learning_rate": 1.749e-06, "loss": 0.0001, "num_tokens": 181969.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.07867051661014557, "kl": 0.0016956999897956848, "learning_rate": 1.752e-06, "loss": 0.0001, "num_tokens": 182185.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 10.851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 6.244733810424805, "kl": 0.003998629283159971, "learning_rate": 1.755e-06, "loss": -0.0022, "num_tokens": 182484.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 10.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.1518163681030273, "kl": 0.0006980647303862497, "learning_rate": 1.758e-06, "loss": -0.0085, "num_tokens": 182815.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 10.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.010196794755756855, "kl": 0.0003098830402450403, "learning_rate": 1.761e-06, "loss": 0.0, "num_tokens": 183124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 10.907407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.224768161773682, "kl": 0.013283525127917528, "learning_rate": 1.764e-06, "loss": 0.3444, "num_tokens": 183425.0, "reward": 5.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.674234628677368, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 10.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.5237748622894287, "kl": 0.21871189028024673, "learning_rate": 1.767e-06, "loss": 0.0574, "num_tokens": 183687.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 10.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 3.8249011039733887, "kl": 0.002103843493387103, "learning_rate": 1.77e-06, "loss": -0.0624, "num_tokens": 184099.0, "reward": 0.800000011920929, "reward_std": 0.9626352787017822, "rewards/reward_combined/mean": 0.800000011920929, "rewards/reward_combined/std": 0.9626352787017822, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 10.962962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 7.345606327056885, "kl": 0.1526459683664143, "learning_rate": 1.773e-06, "loss": -0.0739, "num_tokens": 184397.0, "reward": 2.5, "reward_std": 4.3011627197265625, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 4.3011627197265625, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 10.981481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.04271714389324188, "kl": 0.001882628130260855, "learning_rate": 1.776e-06, "loss": 0.0001, "num_tokens": 184668.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.5479326248168945, "kl": 0.01541160186752677, "learning_rate": 1.779e-06, "loss": 0.0024, "num_tokens": 184956.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.008236559107899666, "kl": 0.00152616947889328, "learning_rate": 1.782e-06, "loss": 0.0001, "num_tokens": 185192.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.037037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 7.302337646484375, "kl": 0.0047858242760412395, "learning_rate": 1.785e-06, "loss": -0.0431, "num_tokens": 185487.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 11.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.033569321036338806, "kl": 0.0007271227514138445, "learning_rate": 1.7879999999999999e-06, "loss": 0.0, "num_tokens": 185721.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07298684120178223, "kl": 0.0019379467121325433, "learning_rate": 1.7909999999999999e-06, "loss": 0.0001, "num_tokens": 185975.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 11.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.13940539956092834, "kl": 0.007875355251599103, "learning_rate": 1.7939999999999999e-06, "loss": 0.0004, "num_tokens": 186245.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 11.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 5.558523178100586, "kl": 0.007870005210861564, "learning_rate": 1.797e-06, "loss": -0.1084, "num_tokens": 186561.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 11.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.3574469089508057, "kl": 0.0032532837940379977, "learning_rate": 1.8e-06, "loss": 0.1086, "num_tokens": 186918.0, "reward": 3.75, "reward_std": 2.723355770111084, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 2.723355770111084, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 11.148148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.04779445007443428, "kl": 0.003014355548657477, "learning_rate": 1.803e-06, "loss": 0.0002, "num_tokens": 187212.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 11.166666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.747562289237976, "kl": 0.009586491622030735, "learning_rate": 1.806e-06, "loss": -0.0665, "num_tokens": 187550.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06118657439947128, "kl": 0.0015967967920005322, "learning_rate": 1.809e-06, "loss": 0.0001, "num_tokens": 187780.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 11.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.004736695904284716, "kl": 0.00037567691470030695, "learning_rate": 1.812e-06, "loss": 0.0, "num_tokens": 188144.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 11.222222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 5.6015238761901855, "kl": 0.0037239082157611847, "learning_rate": 1.815e-06, "loss": 0.0981, "num_tokens": 188453.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 11.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.6835286617279053, "kl": 0.006729712942615151, "learning_rate": 1.818e-06, "loss": 0.0942, "num_tokens": 188797.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.394619941711426, "kl": 0.005084015661850572, "learning_rate": 1.821e-06, "loss": 0.0127, "num_tokens": 189074.0, "reward": 4.375, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 3.902456521987915, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008474576286971569, "clip_ratio/low_min": 0.008474576286971569, "clip_ratio/region_mean": 0.008474576286971569, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 11.277777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 4.031157493591309, "kl": 0.0069244245532900095, "learning_rate": 1.824e-06, "loss": 0.1142, "num_tokens": 189407.0, "reward": 1.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 11.296296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.14288270473480225, "kl": 0.007235784083604813, "learning_rate": 1.827e-06, "loss": 0.0004, "num_tokens": 189705.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.042842842638492584, "kl": 0.0003638714551925659, "learning_rate": 1.83e-06, "loss": 0.0, "num_tokens": 189917.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 6.656713485717773, "kl": 0.00391710945405066, "learning_rate": 1.833e-06, "loss": -0.1864, "num_tokens": 190207.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.1138060986995697, "kl": 0.01024311501532793, "learning_rate": 1.836e-06, "loss": 0.0005, "num_tokens": 190503.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 11.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03635697811841965, "kl": 0.001109056938730646, "learning_rate": 1.839e-06, "loss": 0.0001, "num_tokens": 190832.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01670781336724758, "kl": 0.002560785796958953, "learning_rate": 1.8420000000000001e-06, "loss": 0.0001, "num_tokens": 191102.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.01666666753590107, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01666666753590107, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 11.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 8.373130798339844, "kl": 0.00798106868751347, "learning_rate": 1.8450000000000001e-06, "loss": -0.268, "num_tokens": 191424.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00027208359097130597, "kl": 8.612871170043945e-06, "learning_rate": 1.848e-06, "loss": 0.0, "num_tokens": 191644.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0567723885178566, "kl": 0.004504364216700196, "learning_rate": 1.851e-06, "loss": 0.0002, "num_tokens": 191946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 11.462962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.007303483784198761, "kl": 0.015217685140669346, "learning_rate": 1.854e-06, "loss": 0.0008, "num_tokens": 192206.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 11.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.1344587653875351, "kl": 0.00376090407371521, "learning_rate": 1.857e-06, "loss": 0.0002, "num_tokens": 192422.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 11.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.7340598106384277, "kl": 0.001701810397207737, "learning_rate": 1.86e-06, "loss": 0.0405, "num_tokens": 192751.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 11.518518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 5.458225250244141, "kl": 0.004642652929760516, "learning_rate": 1.863e-06, "loss": -0.0358, "num_tokens": 193027.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.537037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.03666996210813522, "kl": 0.0018023437005467713, "learning_rate": 1.866e-06, "loss": 0.0001, "num_tokens": 193301.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 11.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014777244068682194, "kl": 8.708939640200697e-05, "learning_rate": 1.869e-06, "loss": 0.0, "num_tokens": 193609.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 11.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.767086982727051, "kl": 0.004180784453637898, "learning_rate": 1.872e-06, "loss": 0.153, "num_tokens": 193948.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 11.592592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05402153730392456, "kl": 0.0068692793138325214, "learning_rate": 1.875e-06, "loss": 0.0003, "num_tokens": 194245.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 11.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0920700952410698, "kl": 0.0013527125120162964, "learning_rate": 1.878e-06, "loss": 0.0001, "num_tokens": 194455.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 11.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.006711984518915415, "kl": 0.0010253533837385476, "learning_rate": 1.881e-06, "loss": 0.0001, "num_tokens": 194715.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 11.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 2.964512825012207, "kl": 0.009294234216213226, "learning_rate": 1.884e-06, "loss": -0.0983, "num_tokens": 195069.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 1.4361406564712524, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 11.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 4.334890842437744, "kl": 0.0058948209043592215, "learning_rate": 1.887e-06, "loss": 0.0057, "num_tokens": 195379.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.75, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 42.75, "completions/mean_terminated_length": 42.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 11.685185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751858800649643, "kl": 0.008289248682558537, "learning_rate": 1.8900000000000001e-06, "loss": 0.0004, "num_tokens": 195774.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.703703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.11111831665039062, "kl": 0.0027627437375485897, "learning_rate": 1.8930000000000001e-06, "loss": 0.0001, "num_tokens": 195993.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.722222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 3.696523427963257, "kl": 0.004287190269678831, "learning_rate": 1.8960000000000001e-06, "loss": -0.0525, "num_tokens": 196302.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10204889625310898, "kl": 0.017444612458348274, "learning_rate": 1.8990000000000002e-06, "loss": 0.0009, "num_tokens": 196594.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 11.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.004702435340732336, "kl": 0.00022815167903900146, "learning_rate": 1.9020000000000002e-06, "loss": 0.0, "num_tokens": 196838.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 11.777777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.06758100539445877, "kl": 0.004599316511303186, "learning_rate": 1.905e-06, "loss": 0.0002, "num_tokens": 197147.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 11.796296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.605614185333252, "kl": 0.0018278235220350325, "learning_rate": 1.908e-06, "loss": 0.1203, "num_tokens": 197435.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 11.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.515329837799072, "kl": 0.03643876314163208, "learning_rate": 1.911e-06, "loss": 0.1542, "num_tokens": 197745.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 11.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.05337267369031906, "kl": 0.001255527138710022, "learning_rate": 1.9140000000000002e-06, "loss": 0.0001, "num_tokens": 198005.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 11.851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 1.2822935581207275, "kl": 0.0013061455101706088, "learning_rate": 1.917e-06, "loss": -0.0226, "num_tokens": 198434.0, "reward": 1.2999999523162842, "reward_std": 1.536229133605957, "rewards/reward_combined/mean": 1.2999999523162842, "rewards/reward_combined/std": 1.536229133605957, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.060271814465522766, "kl": 0.0039077382534742355, "learning_rate": 1.9200000000000003e-06, "loss": 0.0002, "num_tokens": 198715.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 11.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.007071491796523333, "kl": 0.00014022439427208155, "learning_rate": 1.923e-06, "loss": 0.0, "num_tokens": 198971.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.907407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 4.723827838897705, "kl": 0.03742720186710358, "learning_rate": 1.9260000000000003e-06, "loss": 0.2653, "num_tokens": 199258.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 11.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.672297239303589, "kl": 0.0019956419564550743, "learning_rate": 1.929e-06, "loss": 0.0335, "num_tokens": 199540.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 11.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 7.243220806121826, "kl": 0.01041489327326417, "learning_rate": 1.9320000000000003e-06, "loss": 0.0625, "num_tokens": 199863.0, "reward": 2.375, "reward_std": 3.75, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 3.75, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 11.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.09201733767986298, "kl": 0.01014986983500421, "learning_rate": 1.935e-06, "loss": 0.0005, "num_tokens": 200185.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 11.981481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05975935608148575, "kl": 0.006647712318226695, "learning_rate": 1.938e-06, "loss": 0.0003, "num_tokens": 200518.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 12.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04419155418872833, "kl": 0.0016164439875865355, "learning_rate": 1.941e-06, "loss": 0.0001, "num_tokens": 200782.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 12.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03825339302420616, "kl": 0.0024186375085264444, "learning_rate": 1.944e-06, "loss": 0.0001, "num_tokens": 201079.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 12.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.2707661986351013, "kl": 0.012258023954927921, "learning_rate": 1.947e-06, "loss": 0.0006, "num_tokens": 201337.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 12.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.10657081753015518, "kl": 0.005779258208349347, "learning_rate": 1.95e-06, "loss": 0.0003, "num_tokens": 201635.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 12.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.175602912902832, "kl": 0.00963138323277235, "learning_rate": 1.953e-06, "loss": 0.0012, "num_tokens": 201916.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 12.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.007182702422142029, "kl": 0.015305588487535715, "learning_rate": 1.956e-06, "loss": 0.0008, "num_tokens": 202176.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.030040180310606956, "kl": 0.0013423172640614212, "learning_rate": 1.9590000000000002e-06, "loss": 0.0001, "num_tokens": 202450.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 12.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.954211235046387, "kl": 0.008251628605648875, "learning_rate": 1.962e-06, "loss": -0.0366, "num_tokens": 202732.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 12.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 2.9378139972686768, "kl": 0.007347192615270615, "learning_rate": 1.9650000000000002e-06, "loss": -0.2469, "num_tokens": 203092.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 12.166666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.04142593964934349, "kl": 0.0014567188918590546, "learning_rate": 1.968e-06, "loss": 0.0001, "num_tokens": 203352.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 12.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030019355472177267, "kl": 0.00010986030247295275, "learning_rate": 1.9710000000000003e-06, "loss": 0.0, "num_tokens": 203620.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 12.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.001762113068252802, "kl": 6.105055217631161e-05, "learning_rate": 1.974e-06, "loss": 0.0, "num_tokens": 203931.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 12.222222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 9.4471435546875, "kl": 0.024239951744675636, "learning_rate": 1.9770000000000003e-06, "loss": 0.0567, "num_tokens": 204168.0, "reward": 3.375, "reward_std": 1.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 1.25, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 12.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.006828906014561653, "kl": 0.0007608592859469354, "learning_rate": 1.98e-06, "loss": 0.0, "num_tokens": 204428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 12.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069746170192956924, "kl": 0.00021936596021987498, "learning_rate": 1.9830000000000003e-06, "loss": 0.0, "num_tokens": 204747.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.13971294462680817, "kl": 0.007924544624984264, "learning_rate": 1.986e-06, "loss": 0.0004, "num_tokens": 205047.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 12.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.632049560546875, "kl": 0.007800285937264562, "learning_rate": 1.9890000000000004e-06, "loss": -0.0547, "num_tokens": 205363.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.013179701752960682, "kl": 0.002018794766627252, "learning_rate": 1.992e-06, "loss": 0.0001, "num_tokens": 205645.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 12.333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 3.3997981548309326, "kl": 0.003044098149985075, "learning_rate": 1.995e-06, "loss": 0.0592, "num_tokens": 205979.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.01910579577088356, "kl": 0.002425070386379957, "learning_rate": 1.998e-06, "loss": 0.0001, "num_tokens": 206249.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 12.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.639466285705566, "kl": 0.020511489361524582, "learning_rate": 2.001e-06, "loss": 0.0638, "num_tokens": 206593.0, "reward": 2.125, "reward_std": 3.9449334144592285, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 3.9449334144592285, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 12.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.862105369567871, "kl": 0.03912351280450821, "learning_rate": 2.004e-06, "loss": 0.1246, "num_tokens": 206900.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 12.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 1.168433427810669, "kl": 0.002303434011992067, "learning_rate": 2.007e-06, "loss": -0.0629, "num_tokens": 207319.0, "reward": 0.675000011920929, "reward_std": 1.0436315536499023, "rewards/reward_combined/mean": 0.675000011920929, "rewards/reward_combined/std": 1.0436315536499023, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 12.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.924370527267456, "kl": 0.00353796174749732, "learning_rate": 2.0100000000000002e-06, "loss": 0.0053, "num_tokens": 207621.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 12.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 3.1171138286590576, "kl": 0.0020666478958446532, "learning_rate": 2.013e-06, "loss": -0.0387, "num_tokens": 207989.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 12.462962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 2.6609792709350586, "kl": 0.7828765045851469, "learning_rate": 2.0160000000000003e-06, "loss": 0.083, "num_tokens": 208275.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 12.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05333259701728821, "kl": 0.0006788596510887146, "learning_rate": 2.019e-06, "loss": 0.0, "num_tokens": 208481.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.12892459332942963, "kl": 0.014746975619345903, "learning_rate": 2.0220000000000003e-06, "loss": 0.0007, "num_tokens": 208749.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 12.518518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 5.480861663818359, "kl": 0.015723032876849174, "learning_rate": 2.025e-06, "loss": 0.0323, "num_tokens": 209049.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 12.537037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 3.6830129623413086, "kl": 0.0030975337140262127, "learning_rate": 2.0280000000000003e-06, "loss": 0.0279, "num_tokens": 209378.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 12.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.05692094564437866, "kl": 0.0045005188876530156, "learning_rate": 2.031e-06, "loss": 0.0004, "num_tokens": 209719.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 12.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011465778516139835, "kl": 2.5704503059387207e-06, "learning_rate": 2.0340000000000003e-06, "loss": 0.0, "num_tokens": 209939.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 12.592592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06668251752853394, "kl": 0.008849663892760873, "learning_rate": 2.037e-06, "loss": 0.0005, "num_tokens": 210277.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 12.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.18250074982643127, "kl": 0.007448920048773289, "learning_rate": 2.0400000000000004e-06, "loss": 0.0004, "num_tokens": 210504.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007042253389954567, "clip_ratio/low_min": 0.007042253389954567, "clip_ratio/region_mean": 0.007042253389954567, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 12.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.01829195022583, "kl": 0.013138486538082361, "learning_rate": 2.043e-06, "loss": -0.1311, "num_tokens": 210859.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 12.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 3.4903883934020996, "kl": 0.011826877947896719, "learning_rate": 2.0460000000000004e-06, "loss": 0.0819, "num_tokens": 211183.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 12.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 4.724998950958252, "kl": 0.01222726097330451, "learning_rate": 2.049e-06, "loss": 0.0461, "num_tokens": 211526.0, "reward": 5.0, "reward_std": 3.5590262413024902, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.5590262413024902, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 12.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.051443099975586, "kl": 0.015238044783473015, "learning_rate": 2.052e-06, "loss": 0.0276, "num_tokens": 211869.0, "reward": 2.75, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.190238118171692, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 12.703703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.032885242253541946, "kl": 0.0011593550152610987, "learning_rate": 2.0550000000000002e-06, "loss": 0.0001, "num_tokens": 212137.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 12.722222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.014226168394088745, "kl": 0.0033940672874450684, "learning_rate": 2.058e-06, "loss": 0.0002, "num_tokens": 212417.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 12.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.164578914642334, "kl": 0.007152646780014038, "learning_rate": 2.0610000000000003e-06, "loss": 0.0035, "num_tokens": 212753.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 12.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.5273241996765137, "kl": 0.014231080407625996, "learning_rate": 2.064e-06, "loss": -0.1156, "num_tokens": 213069.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 12.777777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.01314019225537777, "kl": 0.00022470951080322266, "learning_rate": 2.067e-06, "loss": 0.0, "num_tokens": 213281.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 12.796296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.443144798278809, "kl": 0.0004062642801727634, "learning_rate": 2.07e-06, "loss": 0.0468, "num_tokens": 213502.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 12.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03304954990744591, "kl": 0.00044177046220283955, "learning_rate": 2.073e-06, "loss": 0.0, "num_tokens": 213736.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 12.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01379492785781622, "kl": 0.0003537073644110933, "learning_rate": 2.0759999999999997e-06, "loss": 0.0, "num_tokens": 213992.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 12.851851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.03360677883028984, "kl": 0.0028873877599835396, "learning_rate": 2.079e-06, "loss": 0.0001, "num_tokens": 214304.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 12.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05105031281709671, "kl": 0.001215046620927751, "learning_rate": 2.0819999999999997e-06, "loss": 0.0001, "num_tokens": 214564.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.010638297535479069, "clip_ratio/high_mean": 0.010638297535479069, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010638297535479069, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 12.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.598762512207031, "kl": 0.012793307425454259, "learning_rate": 2.085e-06, "loss": 0.1309, "num_tokens": 214877.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 12.907407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.21235185861587524, "kl": 0.029944309033453465, "learning_rate": 2.0879999999999997e-06, "loss": 0.0011, "num_tokens": 215207.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 12.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.3441944420337677, "kl": 0.0203075148165226, "learning_rate": 2.091e-06, "loss": 0.0012, "num_tokens": 215454.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 4.609631538391113, "kl": 0.03404449298977852, "learning_rate": 2.0939999999999998e-06, "loss": -0.109, "num_tokens": 215744.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 12.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.032854482531547546, "kl": 0.0029180049896240234, "learning_rate": 2.097e-06, "loss": 0.0001, "num_tokens": 215956.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 12.981481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 4.977245807647705, "kl": 0.013467305339872837, "learning_rate": 2.1e-06, "loss": 0.0012, "num_tokens": 216300.0, "reward": 3.0, "reward_std": 3.535533905029297, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 3.535533905029297, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 13.0, "frac_reward_zero_std": 0.0, "grad_norm": 7.03212833404541, "kl": 0.008037997176870704, "learning_rate": 2.103e-06, "loss": -0.098, "num_tokens": 216612.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 13.018518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.9668593406677246, "kl": 0.011052212677896023, "learning_rate": 2.106e-06, "loss": 0.0269, "num_tokens": 216928.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 13.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.031044969335198402, "kl": 0.003113462822511792, "learning_rate": 2.109e-06, "loss": 0.0002, "num_tokens": 217260.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 62.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 62.0, "completions/mean_terminated_length": 62.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 13.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 4.012850284576416, "kl": 0.005787973990663886, "learning_rate": 2.112e-06, "loss": 0.3668, "num_tokens": 217728.0, "reward": 0.5, "reward_std": 1.0, "rewards/reward_combined/mean": 0.5, "rewards/reward_combined/std": 1.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 65.5, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 65.5, "completions/mean_terminated_length": 65.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 13.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.181117296218872, "kl": 0.013391206972301006, "learning_rate": 2.115e-06, "loss": 0.3524, "num_tokens": 218242.0, "reward": -0.07500004768371582, "reward_std": 3.053277015686035, "rewards/reward_combined/mean": -0.07500004768371582, "rewards/reward_combined/std": 3.053277015686035, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 13.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.07046175748109818, "kl": 0.02439271006733179, "learning_rate": 2.118e-06, "loss": 0.0012, "num_tokens": 218542.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 13.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1288159340620041, "kl": 0.0032868816051632166, "learning_rate": 2.121e-06, "loss": 0.0002, "num_tokens": 218806.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 13.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.7185447216033936, "kl": 0.017199629452079535, "learning_rate": 2.124e-06, "loss": 0.0385, "num_tokens": 219094.0, "reward": 5.0, "reward_std": 3.5590262413024902, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.5590262413024902, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 13.148148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.03423300385475159, "kl": 0.004782620584592223, "learning_rate": 2.127e-06, "loss": 0.0002, "num_tokens": 219368.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 13.166666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.13810978829860687, "kl": 0.012777527328580618, "learning_rate": 2.13e-06, "loss": 0.0007, "num_tokens": 219642.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0054832505993545055, "kl": 0.00024031996872508898, "learning_rate": 2.133e-06, "loss": 0.0, "num_tokens": 219902.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 13.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04271876811981201, "kl": 0.0008862614631652832, "learning_rate": 2.136e-06, "loss": 0.0, "num_tokens": 220110.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.222222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.028154635801911354, "kl": 0.0006726061401423067, "learning_rate": 2.1389999999999998e-06, "loss": 0.0, "num_tokens": 220378.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 13.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005581853445619345, "kl": 0.0025982260704040527, "learning_rate": 2.142e-06, "loss": 0.0001, "num_tokens": 220614.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07566168159246445, "kl": 0.0019389942463021725, "learning_rate": 2.145e-06, "loss": 0.0001, "num_tokens": 220870.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 13.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.005097101908177137, "kl": 0.0002322739819646813, "learning_rate": 2.148e-06, "loss": 0.0, "num_tokens": 221186.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 13.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.316486358642578, "kl": 0.01027616742067039, "learning_rate": 2.151e-06, "loss": 0.0341, "num_tokens": 221542.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 13.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.16810569167137146, "kl": 0.019446187652647495, "learning_rate": 2.154e-06, "loss": 0.001, "num_tokens": 221910.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 13.333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 6.1822686195373535, "kl": 0.0010762370002339594, "learning_rate": 2.157e-06, "loss": -0.0719, "num_tokens": 222221.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 13.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.017959382385015488, "kl": 0.015388155821710825, "learning_rate": 2.16e-06, "loss": 0.0008, "num_tokens": 222505.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.2283525466918945, "kl": 0.005473986966535449, "learning_rate": 2.163e-06, "loss": -0.185, "num_tokens": 222783.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01147428434342146, "kl": 0.0003103732888121158, "learning_rate": 2.166e-06, "loss": 0.0, "num_tokens": 223043.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 13.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 3.5182278156280518, "kl": 0.007994799176231027, "learning_rate": 2.169e-06, "loss": 0.1362, "num_tokens": 223377.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 13.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.15986865758895874, "kl": 0.02012438978999853, "learning_rate": 2.172e-06, "loss": 0.001, "num_tokens": 223672.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 13.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0555325485765934, "kl": 0.0026182486035395414, "learning_rate": 2.175e-06, "loss": 0.0001, "num_tokens": 223970.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 13.462962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.002697804942727089, "kl": 9.336024595540948e-05, "learning_rate": 2.178e-06, "loss": 0.0, "num_tokens": 224189.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 13.481481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 4.931563854217529, "kl": 0.012667708564549685, "learning_rate": 2.181e-06, "loss": 0.056, "num_tokens": 224513.0, "reward": 1.625, "reward_std": 2.3935678005218506, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 2.3935678005218506, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 13.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.12529534101486206, "kl": 0.004890830256044865, "learning_rate": 2.184e-06, "loss": 0.0002, "num_tokens": 224839.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 13.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.06014099717140198, "kl": 0.006085360422730446, "learning_rate": 2.187e-06, "loss": 0.0003, "num_tokens": 225172.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 13.537037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.011584106832742691, "kl": 0.0003792308270931244, "learning_rate": 2.19e-06, "loss": 0.0, "num_tokens": 225432.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 13.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0322616845369339, "kl": 0.0035367043456062675, "learning_rate": 2.193e-06, "loss": 0.0002, "num_tokens": 225714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.08717934042215347, "kl": 0.004229001933708787, "learning_rate": 2.196e-06, "loss": 0.0002, "num_tokens": 226018.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 12.425130844116211, "kl": 0.007111624465323985, "learning_rate": 2.199e-06, "loss": -0.0011, "num_tokens": 226288.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 13.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.041111864149570465, "kl": 0.003297999035567045, "learning_rate": 2.202e-06, "loss": 0.0001, "num_tokens": 226611.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 13.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.724043607711792, "kl": 0.0074762695003300905, "learning_rate": 2.205e-06, "loss": -0.0028, "num_tokens": 226930.0, "reward": 4.25, "reward_std": 3.752776622772217, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 3.752776861190796, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 13.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 3.7709622383117676, "kl": 0.04449963755905628, "learning_rate": 2.208e-06, "loss": 0.2254, "num_tokens": 227278.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 13.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.8305257558822632, "kl": 0.0004713718080893159, "learning_rate": 2.211e-06, "loss": -0.0214, "num_tokens": 227644.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 13.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.893324375152588, "kl": 0.005538077675737441, "learning_rate": 2.214e-06, "loss": 0.0954, "num_tokens": 227917.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 739 }, { "clip_ratio/high_max": 0.006493506487458944, "clip_ratio/high_mean": 0.006493506487458944, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006493506487458944, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 13.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.699036121368408, "kl": 0.01296887407079339, "learning_rate": 2.217e-06, "loss": -0.0991, "num_tokens": 228285.0, "reward": 2.875, "reward_std": 2.3935678005218506, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 2.3935678005218506, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.722222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 8.624296188354492, "kl": 0.019901788793504238, "learning_rate": 2.22e-06, "loss": 0.0198, "num_tokens": 228546.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 13.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03395688533782959, "kl": 0.015439601615071297, "learning_rate": 2.223e-06, "loss": 0.0008, "num_tokens": 228843.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 13.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04887813702225685, "kl": 0.0015638243203284219, "learning_rate": 2.226e-06, "loss": 0.0001, "num_tokens": 229077.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 13.777777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.07996886223554611, "kl": 0.007422972586937249, "learning_rate": 2.229e-06, "loss": 0.0004, "num_tokens": 229361.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 13.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.10441425442695618, "kl": 0.008619187399744987, "learning_rate": 2.232e-06, "loss": 0.0004, "num_tokens": 229654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 13.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 12.263534545898438, "kl": 0.00315009499900043, "learning_rate": 2.235e-06, "loss": 0.3098, "num_tokens": 229883.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 13.833333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 6.224886894226074, "kl": 0.003548555658198893, "learning_rate": 2.238e-06, "loss": -0.0285, "num_tokens": 230233.0, "reward": 1.5, "reward_std": 2.2730302810668945, "rewards/reward_combined/mean": 1.5, "rewards/reward_combined/std": 2.2730302810668945, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.25, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 13.851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 2.4481959342956543, "kl": 0.02331713866442442, "learning_rate": 2.2410000000000002e-06, "loss": 0.2235, "num_tokens": 230686.0, "reward": 3.5, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 3.674234628677368, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 13.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015618541510775685, "kl": 2.5704503059387207e-06, "learning_rate": 2.244e-06, "loss": 0.0, "num_tokens": 230906.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 13.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.6655162572860718, "kl": 0.003726708237081766, "learning_rate": 2.2470000000000003e-06, "loss": -0.0406, "num_tokens": 231326.0, "reward": 1.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 1.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 13.907407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 6.533583641052246, "kl": 0.03191567026078701, "learning_rate": 2.25e-06, "loss": -0.0356, "num_tokens": 231621.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 13.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.010266447439789772, "kl": 0.014777874108403921, "learning_rate": 2.253e-06, "loss": 0.0007, "num_tokens": 231881.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 13.944444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.044206928461790085, "kl": 0.004948943882482126, "learning_rate": 2.256e-06, "loss": 0.0002, "num_tokens": 232176.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 13.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.004614643286913633, "kl": 0.00020968914031982422, "learning_rate": 2.259e-06, "loss": 0.0, "num_tokens": 232420.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 13.981481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.12243378907442093, "kl": 0.0037477342411875725, "learning_rate": 2.262e-06, "loss": 0.0002, "num_tokens": 232634.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 14.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.030719023197889328, "kl": 0.0016948133707046509, "learning_rate": 2.265e-06, "loss": 0.0001, "num_tokens": 232846.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 14.018518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.0183265209198, "kl": 0.005362946190871298, "learning_rate": 2.268e-06, "loss": 0.0126, "num_tokens": 233180.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 14.037037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 4.9046735763549805, "kl": 0.025655806064605713, "learning_rate": 2.271e-06, "loss": 0.0606, "num_tokens": 233525.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 14.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.005016519222408533, "kl": 0.002717442810535431, "learning_rate": 2.274e-06, "loss": 0.0001, "num_tokens": 233761.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 14.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 7.308925151824951, "kl": 0.002663616935024038, "learning_rate": 2.277e-06, "loss": 0.0167, "num_tokens": 234076.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 14.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 6.024909019470215, "kl": 0.7515916135162115, "learning_rate": 2.28e-06, "loss": -0.0305, "num_tokens": 234370.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 14.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08666858077049255, "kl": 0.0051762983202934265, "learning_rate": 2.283e-06, "loss": 0.0003, "num_tokens": 234630.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 14.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.029259422793984413, "kl": 0.002253405749797821, "learning_rate": 2.2860000000000002e-06, "loss": 0.0001, "num_tokens": 234944.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 14.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 1.6150531768798828, "kl": 0.005932063329964876, "learning_rate": 2.289e-06, "loss": 0.0327, "num_tokens": 235357.0, "reward": 0.875, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 0.875, "rewards/reward_combined/std": 1.4361406564712524, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 14.166666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 4.03399658203125, "kl": 0.015308836940675974, "learning_rate": 2.2920000000000002e-06, "loss": -0.0414, "num_tokens": 235706.0, "reward": 3.25, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 0.28867512941360474, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 14.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.235726356506348, "kl": 0.018397110048681498, "learning_rate": 2.295e-06, "loss": -0.0134, "num_tokens": 236039.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 14.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.026918374001979828, "kl": 0.001450765848858282, "learning_rate": 2.2980000000000003e-06, "loss": 0.0001, "num_tokens": 236301.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 14.222222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 5.462699890136719, "kl": 0.007106927805580199, "learning_rate": 2.301e-06, "loss": -0.0021, "num_tokens": 236625.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 14.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.8901407718658447, "kl": 0.0013421766343526542, "learning_rate": 2.3040000000000003e-06, "loss": -0.0006, "num_tokens": 236900.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 14.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03274410590529442, "kl": 0.002124100923538208, "learning_rate": 2.307e-06, "loss": 0.0001, "num_tokens": 237112.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 14.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.014366337098181248, "kl": 0.00023959364625625312, "learning_rate": 2.31e-06, "loss": 0.0, "num_tokens": 237348.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 14.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.9902851581573486, "kl": 0.035965283401310444, "learning_rate": 2.313e-06, "loss": -0.0236, "num_tokens": 237691.0, "reward": 4.125, "reward_std": 2.25, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 2.25, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 14.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04854524880647659, "kl": 0.0013521099463105202, "learning_rate": 2.316e-06, "loss": 0.0001, "num_tokens": 237948.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 14.333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 5.973568916320801, "kl": 0.0028942684293724597, "learning_rate": 2.319e-06, "loss": 0.0326, "num_tokens": 238230.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 14.351851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 5.36252498626709, "kl": 0.04083090089261532, "learning_rate": 2.322e-06, "loss": 0.3214, "num_tokens": 238591.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 14.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.025457859039307, "kl": 0.0012301181559450924, "learning_rate": 2.325e-06, "loss": 0.1834, "num_tokens": 238874.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 14.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.516607761383057, "kl": 0.06527543067932129, "learning_rate": 2.328e-06, "loss": 0.1089, "num_tokens": 239186.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 14.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.00022976483160164207, "kl": 5.662441253662109e-06, "learning_rate": 2.3310000000000002e-06, "loss": 0.0, "num_tokens": 239406.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 14.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.25756582617759705, "kl": 0.03808259125798941, "learning_rate": 2.334e-06, "loss": 0.002, "num_tokens": 239709.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 14.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 4.000672817230225, "kl": 0.011569038964807987, "learning_rate": 2.3370000000000002e-06, "loss": 0.0643, "num_tokens": 239997.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 780 }, { "clip_ratio/high_max": 0.01984127052128315, "clip_ratio/high_mean": 0.01984127052128315, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01984127052128315, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 14.462962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 6.125977993011475, "kl": 0.02409778255969286, "learning_rate": 2.34e-06, "loss": -0.335, "num_tokens": 240318.0, "reward": 3.375, "reward_std": 0.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 0.25, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 14.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.15390489995479584, "kl": 0.01383195398375392, "learning_rate": 2.3430000000000003e-06, "loss": 0.0007, "num_tokens": 240634.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 14.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.1950475573539734, "kl": 0.011463714996352792, "learning_rate": 2.346e-06, "loss": 0.0006, "num_tokens": 240902.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 14.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.01879250258207321, "kl": 0.002030833507888019, "learning_rate": 2.3490000000000003e-06, "loss": 0.0001, "num_tokens": 241268.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 14.537037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.00866797473281622, "kl": 0.00024850977933965623, "learning_rate": 2.352e-06, "loss": 0.0, "num_tokens": 241585.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 14.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.029151448979973793, "kl": 0.0009359948744531721, "learning_rate": 2.3550000000000003e-06, "loss": 0.0, "num_tokens": 241892.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 14.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09562128037214279, "kl": 0.007301156176254153, "learning_rate": 2.358e-06, "loss": 0.0004, "num_tokens": 242177.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 14.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 5.764527797698975, "kl": 0.04302992485463619, "learning_rate": 2.3610000000000003e-06, "loss": -0.1232, "num_tokens": 242472.0, "reward": 3.875, "reward_std": 2.9545164108276367, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 2.9545164108276367, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 14.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00547375250607729, "kl": 0.00022152662131702527, "learning_rate": 2.364e-06, "loss": 0.0, "num_tokens": 242692.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 14.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.084255218505859, "kl": 0.019813910126686096, "learning_rate": 2.367e-06, "loss": 0.1207, "num_tokens": 243025.0, "reward": 4.125, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.308422088623047, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 14.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 3.163844347000122, "kl": 0.01603899523615837, "learning_rate": 2.37e-06, "loss": -0.0168, "num_tokens": 243351.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 14.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.1018881797790527, "kl": 0.017605592496693134, "learning_rate": 2.373e-06, "loss": -0.0646, "num_tokens": 243704.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 14.685185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01398887112736702, "kl": 0.001736477017402649, "learning_rate": 2.376e-06, "loss": 0.0001, "num_tokens": 243920.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 14.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 6.382302761077881, "kl": 0.04818444326519966, "learning_rate": 2.379e-06, "loss": -0.0153, "num_tokens": 244215.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 14.722222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 7.991127967834473, "kl": 0.014985023532062769, "learning_rate": 2.3820000000000002e-06, "loss": 0.1454, "num_tokens": 244483.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 14.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.7530293464660645, "kl": 0.010870927944779396, "learning_rate": 2.385e-06, "loss": 0.1486, "num_tokens": 244785.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 14.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.771587371826172, "kl": 0.004894593148492277, "learning_rate": 2.3880000000000003e-06, "loss": 0.021, "num_tokens": 245070.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 14.777777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 5.494588375091553, "kl": 0.008534710621461272, "learning_rate": 2.391e-06, "loss": -0.1524, "num_tokens": 245352.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 14.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.005379894282668829, "kl": 0.01579880900681019, "learning_rate": 2.3940000000000003e-06, "loss": 0.0008, "num_tokens": 245612.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 14.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 7.056868553161621, "kl": 0.008523806929588318, "learning_rate": 2.397e-06, "loss": 0.0612, "num_tokens": 245911.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 14.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.14603033661842346, "kl": 0.004562627989798784, "learning_rate": 2.4000000000000003e-06, "loss": 0.0002, "num_tokens": 246156.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 68.25, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 68.25, "completions/mean_terminated_length": 68.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 14.851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 2.5938432216644287, "kl": 0.011430806946009398, "learning_rate": 2.403e-06, "loss": 0.46, "num_tokens": 246649.0, "reward": 2.799999952316284, "reward_std": 1.399999976158142, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 1.399999976158142, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 14.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.2324957847595215, "kl": 0.004533653263933957, "learning_rate": 2.4060000000000003e-06, "loss": -0.0011, "num_tokens": 246941.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 14.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 9.289831161499023, "kl": 0.008422995451837778, "learning_rate": 2.409e-06, "loss": -0.0143, "num_tokens": 247212.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 14.907407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07687978446483612, "kl": 0.007713424973189831, "learning_rate": 2.4120000000000004e-06, "loss": 0.0004, "num_tokens": 247537.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 14.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07869924604892731, "kl": 0.0005496889352798462, "learning_rate": 2.415e-06, "loss": 0.0, "num_tokens": 247749.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 14.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 7.808557987213135, "kl": 0.023773484863340855, "learning_rate": 2.4180000000000004e-06, "loss": 0.1667, "num_tokens": 248027.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 14.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.019696783274412155, "kl": 0.0003880545264109969, "learning_rate": 2.421e-06, "loss": 0.0, "num_tokens": 248295.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 14.981481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.5050677061080933, "kl": 0.02722766832448542, "learning_rate": 2.4240000000000004e-06, "loss": 0.0012, "num_tokens": 248571.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 15.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.17297464609146118, "kl": 0.006345292087644339, "learning_rate": 2.4270000000000002e-06, "loss": 0.0004, "num_tokens": 248781.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 15.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.012793388217687607, "kl": 0.0004118494689464569, "learning_rate": 2.43e-06, "loss": 0.0, "num_tokens": 249041.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 15.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.05514936149120331, "kl": 0.005671827122569084, "learning_rate": 2.4330000000000003e-06, "loss": 0.0003, "num_tokens": 249353.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 15.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.10728318989276886, "kl": 0.01710322964936495, "learning_rate": 2.436e-06, "loss": 0.0009, "num_tokens": 249679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 15.074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.004523104056715965, "kl": 0.015971110202372074, "learning_rate": 2.439e-06, "loss": 0.0008, "num_tokens": 249939.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 15.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 3.3868942260742188, "kl": 0.02833088766783476, "learning_rate": 2.442e-06, "loss": -0.0053, "num_tokens": 250283.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 15.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.006653910502791405, "kl": 0.00028151705919299275, "learning_rate": 2.445e-06, "loss": 0.0, "num_tokens": 250601.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 15.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08972105383872986, "kl": 0.0036098076961934566, "learning_rate": 2.448e-06, "loss": 0.0002, "num_tokens": 250866.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.75, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 15.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 4.391178131103516, "kl": 0.018015262205153704, "learning_rate": 2.451e-06, "loss": 0.0543, "num_tokens": 251269.0, "reward": 4.125, "reward_std": 2.75, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 2.75, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 15.166666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.3129059672355652, "kl": 0.030778750777244568, "learning_rate": 2.4539999999999997e-06, "loss": 0.0015, "num_tokens": 251481.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 15.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 20.031448364257812, "kl": 0.008709351997822523, "learning_rate": 2.457e-06, "loss": -0.2137, "num_tokens": 251697.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 15.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.34016168117523193, "kl": 0.013878948986530304, "learning_rate": 2.4599999999999997e-06, "loss": 0.0007, "num_tokens": 251913.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 15.222222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 2.0805835723876953, "kl": 0.025030162185430527, "learning_rate": 2.463e-06, "loss": 0.011, "num_tokens": 252278.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 822 }, { "clip_ratio/high_max": 0.006849315017461777, "clip_ratio/high_mean": 0.006849315017461777, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006849315017461777, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 15.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.046453833580017, "kl": 0.011136360466480255, "learning_rate": 2.4659999999999998e-06, "loss": 0.0495, "num_tokens": 252691.0, "reward": 0.375, "reward_std": 0.25, "rewards/reward_combined/mean": 0.375, "rewards/reward_combined/std": 0.25, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 15.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.005311328452080488, "kl": 0.0026903748512268066, "learning_rate": 2.469e-06, "loss": 0.0001, "num_tokens": 252927.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 15.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.12365634739398956, "kl": 0.006539266090840101, "learning_rate": 2.472e-06, "loss": 0.0003, "num_tokens": 253223.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 15.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.3430609703063965, "kl": 0.01864812895655632, "learning_rate": 2.475e-06, "loss": -0.0173, "num_tokens": 253498.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 15.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00028615814517252147, "kl": 8.106231689453125e-06, "learning_rate": 2.478e-06, "loss": 0.0, "num_tokens": 253718.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 15.333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 7.083314895629883, "kl": 0.03541209362447262, "learning_rate": 2.481e-06, "loss": -0.0688, "num_tokens": 254024.0, "reward": 3.375, "reward_std": 2.9545164108276367, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 2.9545164108276367, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.010798932984471321, "kl": 0.01656962465494871, "learning_rate": 2.484e-06, "loss": 0.0008, "num_tokens": 254308.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 88.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 88.25, "completions/mean_terminated_length": 32.333335876464844, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 15.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.8000051975250244, "kl": 0.012468930799514055, "learning_rate": 2.487e-06, "loss": 0.4189, "num_tokens": 254885.0, "reward": 1.6749999523162842, "reward_std": 4.925021171569824, "rewards/reward_combined/mean": 1.6749999523162842, "rewards/reward_combined/std": 4.925021648406982, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 15.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04856877028942108, "kl": 0.006245983298867941, "learning_rate": 2.49e-06, "loss": 0.0003, "num_tokens": 255219.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 15.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.021499263122677803, "kl": 0.0004824884235858917, "learning_rate": 2.493e-06, "loss": 0.0, "num_tokens": 255463.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 15.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.002227855147793889, "kl": 0.0009088899241760373, "learning_rate": 2.496e-06, "loss": 0.0, "num_tokens": 255743.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.1993919461965561, "kl": 0.011588844936341047, "learning_rate": 2.499e-06, "loss": 0.0006, "num_tokens": 256014.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 70.25, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 70.25, "completions/mean_terminated_length": 70.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 15.462962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 3.380399227142334, "kl": 0.015295200049877167, "learning_rate": 2.502e-06, "loss": -0.0819, "num_tokens": 256515.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 15.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02020743303000927, "kl": 0.0024401472182944417, "learning_rate": 2.505e-06, "loss": 0.0001, "num_tokens": 256841.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 836 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01785714365541935, "completion_length": 75.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 75.25, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.8669129610061646, "kl": 0.018591200932860374, "learning_rate": 2.508e-06, "loss": 0.4382, "num_tokens": 257366.0, "reward": 5.0, "reward_std": 6.0, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 6.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.518518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.980868339538574, "kl": 0.06596332974731922, "learning_rate": 2.5109999999999998e-06, "loss": 0.1504, "num_tokens": 257679.0, "reward": 4.875, "reward_std": 3.1983067989349365, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.1983067989349365, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 15.537037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 4.717414855957031, "kl": 0.049179114401340485, "learning_rate": 2.514e-06, "loss": -0.0306, "num_tokens": 258011.0, "reward": 4.125, "reward_std": 2.25, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 2.25, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 15.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.025592342019081116, "kl": 0.0010536994668655097, "learning_rate": 2.5169999999999998e-06, "loss": 0.0001, "num_tokens": 258291.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 15.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.1575191020965576, "kl": 0.02428480051457882, "learning_rate": 2.52e-06, "loss": -0.0155, "num_tokens": 258654.0, "reward": 3.0, "reward_std": 3.188521146774292, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 3.188521146774292, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 77.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 77.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 15.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.552376747131348, "kl": 0.13188587129116058, "learning_rate": 2.523e-06, "loss": 0.421, "num_tokens": 259205.0, "reward": 4.175000190734863, "reward_std": 4.418427467346191, "rewards/reward_combined/mean": 4.175000190734863, "rewards/reward_combined/std": 4.418426990509033, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.011714904569089413, "kl": 0.0006037205748725682, "learning_rate": 2.526e-06, "loss": 0.0, "num_tokens": 259465.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03280490264296532, "kl": 0.0005672499537467957, "learning_rate": 2.529e-06, "loss": 0.0, "num_tokens": 259721.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 15.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 2.0065717697143555, "kl": 0.016092784702777863, "learning_rate": 2.532e-06, "loss": 0.4498, "num_tokens": 260253.0, "reward": 5.050000190734863, "reward_std": 4.900000095367432, "rewards/reward_combined/mean": 5.050000190734863, "rewards/reward_combined/std": 4.900000095367432, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.742649555206299, "kl": 0.030048758257180452, "learning_rate": 2.535e-06, "loss": 0.0033, "num_tokens": 260551.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 8.368227005004883, "kl": 0.023549416102468967, "learning_rate": 2.538e-06, "loss": -0.2378, "num_tokens": 260842.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 15.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.097448348999023, "kl": 0.03640543203800917, "learning_rate": 2.541e-06, "loss": -0.0501, "num_tokens": 261193.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 15.722222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.1348850280046463, "kl": 0.014819524250924587, "learning_rate": 2.544e-06, "loss": 0.0007, "num_tokens": 261461.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 15.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 8.457876205444336, "kl": 0.10042929649353027, "learning_rate": 2.547e-06, "loss": 0.0602, "num_tokens": 261674.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 15.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.18141864240169525, "kl": 0.009819218306802213, "learning_rate": 2.55e-06, "loss": 0.0005, "num_tokens": 261909.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 15.777777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.01111546903848648, "kl": 0.0038869944401085377, "learning_rate": 2.553e-06, "loss": 0.0002, "num_tokens": 262179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.796296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.720064640045166, "kl": 0.005797249847091734, "learning_rate": 2.556e-06, "loss": 0.0021, "num_tokens": 262469.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 15.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.499596118927002, "kl": 0.019651985203381628, "learning_rate": 2.559e-06, "loss": 0.0151, "num_tokens": 262782.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 15.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.40037572383880615, "kl": 0.017680106684565544, "learning_rate": 2.562e-06, "loss": 0.0009, "num_tokens": 263106.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 15.851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 6.041896343231201, "kl": 0.018168576061725616, "learning_rate": 2.565e-06, "loss": -0.0974, "num_tokens": 263418.0, "reward": 5.5, "reward_std": 2.886751174926758, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 2.886751413345337, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 15.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.9160261154174805, "kl": 0.036556024104356766, "learning_rate": 2.568e-06, "loss": 0.011, "num_tokens": 263751.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 15.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.762621283531189, "kl": 0.018396658822894096, "learning_rate": 2.571e-06, "loss": 0.0008, "num_tokens": 264043.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.907407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05802302062511444, "kl": 0.0029262282769195735, "learning_rate": 2.574e-06, "loss": 0.0002, "num_tokens": 264305.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 15.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.820932149887085, "kl": 0.007205143105238676, "learning_rate": 2.577e-06, "loss": -0.0397, "num_tokens": 264587.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 15.944444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.008702805265784264, "kl": 0.0010077431797981262, "learning_rate": 2.58e-06, "loss": 0.0001, "num_tokens": 264795.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.11642981320619583, "kl": 0.0075371162965893745, "learning_rate": 2.583e-06, "loss": 0.0004, "num_tokens": 265065.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 57.25, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.981481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.389289915561676, "kl": 0.026683930307626724, "learning_rate": 2.586e-06, "loss": 0.0015, "num_tokens": 265514.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 16.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.006408870220184326, "kl": 9.412899089511484e-05, "learning_rate": 2.589e-06, "loss": 0.0, "num_tokens": 265821.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 16.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.14997774362564087, "kl": 0.02597730467095971, "learning_rate": 2.592e-06, "loss": 0.0013, "num_tokens": 266103.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 16.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.08316528052091599, "kl": 0.009424427058547735, "learning_rate": 2.595e-06, "loss": 0.0005, "num_tokens": 266403.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 16.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.11386890709400177, "kl": 0.01250389963388443, "learning_rate": 2.598e-06, "loss": 0.0006, "num_tokens": 266639.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 16.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 1.8794735670089722, "kl": 0.1778415720909834, "learning_rate": 2.601e-06, "loss": 0.0097, "num_tokens": 266926.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 16.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.02523677609860897, "kl": 0.0018860953277908266, "learning_rate": 2.604e-06, "loss": 0.0001, "num_tokens": 267161.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 16.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04206731542944908, "kl": 0.002634609234519303, "learning_rate": 2.607e-06, "loss": 0.0001, "num_tokens": 267451.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 16.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.223171234130859, "kl": 0.01811151672154665, "learning_rate": 2.61e-06, "loss": 0.0318, "num_tokens": 267723.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 16.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08685275167226791, "kl": 0.0033728512935340405, "learning_rate": 2.6130000000000002e-06, "loss": 0.0002, "num_tokens": 267966.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 16.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.013421605341136456, "kl": 0.0006301686516962945, "learning_rate": 2.616e-06, "loss": 0.0, "num_tokens": 268280.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 16.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017196089029312134, "kl": 0.0010637480881996453, "learning_rate": 2.6190000000000003e-06, "loss": 0.0001, "num_tokens": 268560.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.014285714365541935, "clip_ratio/high_mean": 0.014285714365541935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014285714365541935, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 16.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 6.5222344398498535, "kl": 0.08057743683457375, "learning_rate": 2.622e-06, "loss": 0.257, "num_tokens": 268901.0, "reward": 6.375, "reward_std": 2.136000871658325, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.136000871658325, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 16.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.050898998975753784, "kl": 0.01073373481631279, "learning_rate": 2.6250000000000003e-06, "loss": 0.0005, "num_tokens": 269233.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 16.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00040665382402949035, "kl": 2.074986696243286e-05, "learning_rate": 2.628e-06, "loss": 0.0, "num_tokens": 269453.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 16.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.1193506717681885, "kl": 0.027223428711295128, "learning_rate": 2.631e-06, "loss": 0.0201, "num_tokens": 269794.0, "reward": 2.5, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.34165620803833, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 16.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 6.142059803009033, "kl": 0.09255321323871613, "learning_rate": 2.634e-06, "loss": 0.233, "num_tokens": 270165.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 16.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.2199128121137619, "kl": 0.006227896548807621, "learning_rate": 2.637e-06, "loss": 0.0005, "num_tokens": 270392.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 16.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.009426550008356571, "kl": 0.0012574732536450028, "learning_rate": 2.64e-06, "loss": 0.0001, "num_tokens": 270612.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01666666753590107, "clip_ratio/low_min": 0.01666666753590107, "clip_ratio/region_mean": 0.01666666753590107, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 16.333333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 5.762788772583008, "kl": 0.059044482884928584, "learning_rate": 2.643e-06, "loss": 0.0679, "num_tokens": 270888.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 16.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.007317017298191786, "kl": 0.0015296489582397044, "learning_rate": 2.646e-06, "loss": 0.0001, "num_tokens": 271148.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 16.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.006072989199310541, "kl": 8.368250928469934e-05, "learning_rate": 2.649e-06, "loss": 0.0, "num_tokens": 271455.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 16.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 6.729745864868164, "kl": 0.04546512849628925, "learning_rate": 2.652e-06, "loss": 0.1228, "num_tokens": 271787.0, "reward": 2.0, "reward_std": 2.4494898319244385, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 2.4494898319244385, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 16.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.02424285188317299, "kl": 0.0010086670517921448, "learning_rate": 2.655e-06, "loss": 0.0001, "num_tokens": 271999.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 16.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.06065645441412926, "kl": 0.03292999789118767, "learning_rate": 2.6580000000000002e-06, "loss": 0.0016, "num_tokens": 272299.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 16.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.10115627944469452, "kl": 0.018276115879416466, "learning_rate": 2.661e-06, "loss": 0.0009, "num_tokens": 272665.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 16.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.175871849060059, "kl": 0.013722633011639118, "learning_rate": 2.6640000000000002e-06, "loss": -0.0609, "num_tokens": 272955.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 16.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08490806072950363, "kl": 0.0038473325548693538, "learning_rate": 2.667e-06, "loss": 0.0002, "num_tokens": 273227.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 16.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.08408478647470474, "kl": 0.014603359624743462, "learning_rate": 2.6700000000000003e-06, "loss": 0.0007, "num_tokens": 273559.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 16.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03706580400466919, "kl": 0.003242477774620056, "learning_rate": 2.673e-06, "loss": 0.0002, "num_tokens": 273771.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 16.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.11515523493289948, "kl": 0.013682656921446323, "learning_rate": 2.6760000000000003e-06, "loss": 0.0007, "num_tokens": 274053.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 16.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0946512445807457, "kl": 0.0027261764043942094, "learning_rate": 2.679e-06, "loss": 0.0001, "num_tokens": 274309.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 16.574074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 10.735785484313965, "kl": 0.17699826508760452, "learning_rate": 2.6820000000000003e-06, "loss": 0.0939, "num_tokens": 274586.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 16.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.027571633458137512, "kl": 0.0007305496255867183, "learning_rate": 2.685e-06, "loss": 0.0, "num_tokens": 274906.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 16.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 7.860959529876709, "kl": 0.1105173472315073, "learning_rate": 2.688e-06, "loss": 0.0969, "num_tokens": 275218.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 16.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04810880124568939, "kl": 0.001539662480354309, "learning_rate": 2.691e-06, "loss": 0.0001, "num_tokens": 275428.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 16.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03662363812327385, "kl": 0.0009340256219729781, "learning_rate": 2.694e-06, "loss": 0.0, "num_tokens": 275688.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 16.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 1.4957729578018188, "kl": 0.10789931565523148, "learning_rate": 2.697e-06, "loss": 0.0042, "num_tokens": 276052.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 16.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.046485308557748795, "kl": 0.0034694699570536613, "learning_rate": 2.7e-06, "loss": 0.0002, "num_tokens": 276324.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 16.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.00432240916416049, "kl": 0.016069352626800537, "learning_rate": 2.703e-06, "loss": 0.0008, "num_tokens": 276584.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 16.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.06279173493385315, "kl": 0.0014365874230861664, "learning_rate": 2.706e-06, "loss": 0.0001, "num_tokens": 276844.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 16.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03251232951879501, "kl": 0.019935129210352898, "learning_rate": 2.7090000000000002e-06, "loss": 0.001, "num_tokens": 277140.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007462686393409967, "clip_ratio/low_min": 0.007462686393409967, "clip_ratio/region_mean": 0.007462686393409967, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 16.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.653429985046387, "kl": 0.047437798231840134, "learning_rate": 2.712e-06, "loss": 0.1554, "num_tokens": 277493.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 16.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.010527708567678928, "kl": 0.0014890655875205994, "learning_rate": 2.7150000000000003e-06, "loss": 0.0001, "num_tokens": 277805.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 16.796296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 2.460294008255005, "kl": 0.0361027829349041, "learning_rate": 2.718e-06, "loss": -0.1019, "num_tokens": 278240.0, "reward": 2.174999952316284, "reward_std": 1.1786290407180786, "rewards/reward_combined/mean": 2.174999952316284, "rewards/reward_combined/std": 1.1786291599273682, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 16.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0483318492770195, "kl": 0.00674024224281311, "learning_rate": 2.7210000000000003e-06, "loss": 0.0003, "num_tokens": 278508.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 16.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.05405644699931145, "kl": 0.0017870822339318693, "learning_rate": 2.724e-06, "loss": 0.0001, "num_tokens": 278824.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 16.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 7.086075782775879, "kl": 0.009135594591498375, "learning_rate": 2.7270000000000003e-06, "loss": 0.1567, "num_tokens": 279173.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 69.75, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 69.75, "completions/mean_terminated_length": 69.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 16.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.284515142440796, "kl": 0.03399188816547394, "learning_rate": 2.73e-06, "loss": 0.4709, "num_tokens": 279672.0, "reward": 3.625, "reward_std": 5.202163219451904, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 5.202163219451904, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 16.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07876795530319214, "kl": 0.0020766069064848125, "learning_rate": 2.7330000000000003e-06, "loss": 0.0001, "num_tokens": 279968.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 16.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 4.349666595458984, "kl": 0.06186091527342796, "learning_rate": 2.736e-06, "loss": -0.001, "num_tokens": 280296.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 16.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.14858929812908173, "kl": 0.010949058923870325, "learning_rate": 2.7390000000000004e-06, "loss": 0.0005, "num_tokens": 280566.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 16.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.01540637481957674, "kl": 0.004458198556676507, "learning_rate": 2.742e-06, "loss": 0.0002, "num_tokens": 280836.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 16.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 7.211705684661865, "kl": 0.02147698076441884, "learning_rate": 2.745e-06, "loss": 0.0209, "num_tokens": 281114.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 16.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.836970090866089, "kl": 0.02658749930560589, "learning_rate": 2.748e-06, "loss": 0.0343, "num_tokens": 281433.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 17.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.061885833740234, "kl": 0.21568666584789753, "learning_rate": 2.751e-06, "loss": 0.0096, "num_tokens": 281738.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.024867020547389984, "kl": 0.0009969845414161682, "learning_rate": 2.7540000000000002e-06, "loss": 0.0, "num_tokens": 281950.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 17.037037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 6.72955322265625, "kl": 0.014656886691227555, "learning_rate": 2.757e-06, "loss": 0.0055, "num_tokens": 282220.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 17.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.009860399179160595, "kl": 0.0010821044561453164, "learning_rate": 2.7600000000000003e-06, "loss": 0.0001, "num_tokens": 282440.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 17.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.20747016370296478, "kl": 0.014619201421737671, "learning_rate": 2.763e-06, "loss": 0.0007, "num_tokens": 282652.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 17.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.4475628137588501, "kl": 0.05984331271611154, "learning_rate": 2.7660000000000003e-06, "loss": 0.0023, "num_tokens": 282974.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 17.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.306441783905029, "kl": 0.03935196250677109, "learning_rate": 2.769e-06, "loss": 0.0715, "num_tokens": 283339.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 17.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.366795063018799, "kl": 0.0751175731420517, "learning_rate": 2.7720000000000003e-06, "loss": -0.0527, "num_tokens": 283680.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 17.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02690128982067108, "kl": 0.0009666383266448975, "learning_rate": 2.775e-06, "loss": 0.0, "num_tokens": 283960.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 17.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.04740017652511597, "kl": 0.0076216175220906734, "learning_rate": 2.7780000000000003e-06, "loss": 0.0004, "num_tokens": 284289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 17.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.004154135473072529, "kl": 0.016127178445458412, "learning_rate": 2.781e-06, "loss": 0.0008, "num_tokens": 284549.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 17.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.09007273614406586, "kl": 0.017842161934822798, "learning_rate": 2.7840000000000004e-06, "loss": 0.0009, "num_tokens": 284882.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 17.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.6361002922058105, "kl": 0.09756945073604584, "learning_rate": 2.787e-06, "loss": 0.0767, "num_tokens": 285235.0, "reward": 3.875, "reward_std": 2.9545164108276367, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 2.9545164108276367, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 17.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.008572395890951157, "kl": 0.0013220729306340218, "learning_rate": 2.7900000000000004e-06, "loss": 0.0001, "num_tokens": 285547.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 17.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 7.750455856323242, "kl": 0.12378092110157013, "learning_rate": 2.793e-06, "loss": 0.1037, "num_tokens": 285854.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 17.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.17185324430465698, "kl": 0.02490962017327547, "learning_rate": 2.7960000000000004e-06, "loss": 0.0012, "num_tokens": 286147.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 17.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 6.1884236335754395, "kl": 0.002807863100315444, "learning_rate": 2.7990000000000002e-06, "loss": -0.0004, "num_tokens": 286403.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 17.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.06539967656135559, "kl": 0.005298139003571123, "learning_rate": 2.802e-06, "loss": 0.0003, "num_tokens": 286733.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 17.333333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 4.5376152992248535, "kl": 0.03906891401857138, "learning_rate": 2.8050000000000002e-06, "loss": 0.015, "num_tokens": 287027.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 17.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.018045363947749138, "kl": 0.005035794340074062, "learning_rate": 2.808e-06, "loss": 0.0003, "num_tokens": 287315.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 17.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 1.0250736474990845, "kl": 0.07505254819989204, "learning_rate": 2.8110000000000003e-06, "loss": 0.0038, "num_tokens": 287578.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 17.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.020768439397215843, "kl": 0.020604564808309078, "learning_rate": 2.814e-06, "loss": 0.001, "num_tokens": 287870.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 17.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 3.393409252166748, "kl": 0.0839981846511364, "learning_rate": 2.817e-06, "loss": 0.1574, "num_tokens": 288236.0, "reward": 1.875, "reward_std": 1.314977765083313, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 1.3149778842926025, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 17.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.18137066066265106, "kl": 0.00959052995312959, "learning_rate": 2.82e-06, "loss": 0.0005, "num_tokens": 288497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 17.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.05492454767227173, "kl": 0.009323860984295607, "learning_rate": 2.823e-06, "loss": 0.0005, "num_tokens": 288783.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 942 }, { "clip_ratio/high_max": 0.0024271844886243343, "clip_ratio/high_mean": 0.0024271844886243343, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024271844886243343, "completion_length": 66.25, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 66.25, "completions/mean_terminated_length": 66.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 17.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.7071022987365723, "kl": 0.06320535112172365, "learning_rate": 2.8259999999999997e-06, "loss": 0.0383, "num_tokens": 289264.0, "reward": 2.0, "reward_std": 1.471960186958313, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 1.471960186958313, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 17.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.644867420196533, "kl": 0.04936415143311024, "learning_rate": 2.829e-06, "loss": 0.0008, "num_tokens": 289567.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 17.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.2899612188339233, "kl": 0.02315691113471985, "learning_rate": 2.8319999999999997e-06, "loss": -0.0807, "num_tokens": 289979.0, "reward": 2.174999952316284, "reward_std": 1.1786291599273682, "rewards/reward_combined/mean": 2.174999952316284, "rewards/reward_combined/std": 1.1786291599273682, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 17.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.6830765008926392, "kl": 0.06120620295405388, "learning_rate": 2.835e-06, "loss": 0.0034, "num_tokens": 290261.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 17.537037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 5.678679466247559, "kl": 0.030490998411551118, "learning_rate": 2.8379999999999998e-06, "loss": 0.099, "num_tokens": 290537.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 17.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.06989409774541855, "kl": 0.0022222733023227192, "learning_rate": 2.841e-06, "loss": 0.0001, "num_tokens": 290772.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 17.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.09559326618909836, "kl": 0.0045561735751107335, "learning_rate": 2.844e-06, "loss": 0.0002, "num_tokens": 291042.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 17.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.11788101494312286, "kl": 0.0019803866744041443, "learning_rate": 2.847e-06, "loss": 0.0001, "num_tokens": 291248.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 17.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 7.919772148132324, "kl": 0.005017139395931736, "learning_rate": 2.85e-06, "loss": 0.1366, "num_tokens": 291518.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 17.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02758970484137535, "kl": 0.0008264200441772118, "learning_rate": 2.853e-06, "loss": 0.0, "num_tokens": 291824.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 17.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027528919745236635, "kl": 0.0007725166215095669, "learning_rate": 2.856e-06, "loss": 0.0, "num_tokens": 292104.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 17.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.015700655058026314, "kl": 0.00035790354013442993, "learning_rate": 2.859e-06, "loss": 0.0, "num_tokens": 292348.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 17.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.4215909242630005, "kl": 0.05775163508951664, "learning_rate": 2.862e-06, "loss": 0.0031, "num_tokens": 292670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 17.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 5.383664131164551, "kl": 0.04355753492563963, "learning_rate": 2.865e-06, "loss": -0.1513, "num_tokens": 293015.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 17.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.007158689200878143, "kl": 0.00010698745609261096, "learning_rate": 2.868e-06, "loss": 0.0, "num_tokens": 293336.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 17.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10804415494203568, "kl": 0.008159431832609698, "learning_rate": 2.871e-06, "loss": 0.0004, "num_tokens": 293645.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 17.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.1420525163412094, "kl": 0.005328859901055694, "learning_rate": 2.874e-06, "loss": 0.0003, "num_tokens": 293941.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 17.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.02422979660332203, "kl": 0.09467829018831253, "learning_rate": 2.877e-06, "loss": 0.0047, "num_tokens": 294307.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 17.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.00691164331510663, "kl": 0.0017841786611825228, "learning_rate": 2.88e-06, "loss": 0.0001, "num_tokens": 294567.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 17.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.05318329483270645, "kl": 0.03480059280991554, "learning_rate": 2.883e-06, "loss": 0.0017, "num_tokens": 294867.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004444839432835579, "kl": 4.042685031890869e-05, "learning_rate": 2.886e-06, "loss": 0.0, "num_tokens": 295087.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02944686822593212, "kl": 0.0016392802353948355, "learning_rate": 2.8889999999999998e-06, "loss": 0.0001, "num_tokens": 295314.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 17.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.5267839431762695, "kl": 0.09300582110881805, "learning_rate": 2.892e-06, "loss": -0.0358, "num_tokens": 295622.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 13.580138206481934, "kl": 0.023242179304361343, "learning_rate": 2.895e-06, "loss": 0.1713, "num_tokens": 295862.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 17.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 4.606494903564453, "kl": 0.05697191320359707, "learning_rate": 2.898e-06, "loss": 0.0362, "num_tokens": 296148.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 17.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 7.292166709899902, "kl": 0.02170123066753149, "learning_rate": 2.901e-06, "loss": 0.0653, "num_tokens": 296433.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 17.944444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 6.754703521728516, "kl": 0.013701246120035648, "learning_rate": 2.904e-06, "loss": 0.0002, "num_tokens": 296778.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 17.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 8.046321868896484, "kl": 0.02073330502025783, "learning_rate": 2.907e-06, "loss": 0.0146, "num_tokens": 297048.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 17.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.040881071239709854, "kl": 0.018811689253197983, "learning_rate": 2.91e-06, "loss": 0.0009, "num_tokens": 297336.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 18.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.15847693383693695, "kl": 0.003819052129983902, "learning_rate": 2.913e-06, "loss": 0.0002, "num_tokens": 297596.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 18.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04494069516658783, "kl": 0.0062313086818903685, "learning_rate": 2.916e-06, "loss": 0.0003, "num_tokens": 297854.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 18.037037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.149745464324951, "kl": 0.014793979469686747, "learning_rate": 2.919e-06, "loss": 0.0001, "num_tokens": 298285.0, "reward": 1.8499999046325684, "reward_std": 0.9110432863235474, "rewards/reward_combined/mean": 1.8499999046325684, "rewards/reward_combined/std": 0.9110434055328369, "step": 974 }, { "clip_ratio/high_max": 0.017241379246115685, "clip_ratio/high_mean": 0.017241379246115685, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017241379246115685, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 18.055555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 6.219341278076172, "kl": 0.041466670110821724, "learning_rate": 2.922e-06, "loss": 0.0819, "num_tokens": 298566.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 18.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.1388024240732193, "kl": 0.010825720615684986, "learning_rate": 2.925e-06, "loss": 0.0005, "num_tokens": 298822.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 18.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.036770425736904144, "kl": 0.00320415198802948, "learning_rate": 2.928e-06, "loss": 0.0002, "num_tokens": 299034.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 18.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.16454218327999115, "kl": 0.026634372770786285, "learning_rate": 2.931e-06, "loss": 0.0013, "num_tokens": 299356.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 18.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029778636526316404, "kl": 0.016340223141014576, "learning_rate": 2.934e-06, "loss": 0.0008, "num_tokens": 299616.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 18.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10779254138469696, "kl": 0.0024613887071609497, "learning_rate": 2.937e-06, "loss": 0.0001, "num_tokens": 299824.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 18.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.00044605552102439106, "kl": 4.427880048751831e-05, "learning_rate": 2.9400000000000002e-06, "loss": 0.0, "num_tokens": 300044.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 18.185185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 6.850919246673584, "kl": 0.014800236793234944, "learning_rate": 2.943e-06, "loss": 0.0149, "num_tokens": 300322.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 18.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 2.76835036277771, "kl": 0.03857973590493202, "learning_rate": 2.946e-06, "loss": 0.02, "num_tokens": 300635.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 18.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 5.330134868621826, "kl": 0.0038978730226517655, "learning_rate": 2.949e-06, "loss": 0.0399, "num_tokens": 300957.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 18.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 7.354736804962158, "kl": 0.01612484361976385, "learning_rate": 2.952e-06, "loss": -0.0576, "num_tokens": 301224.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 18.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.8794246912002563, "kl": 0.13801612704992294, "learning_rate": 2.955e-06, "loss": 0.0071, "num_tokens": 301526.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 18.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020747825037688017, "kl": 0.00010592797480057925, "learning_rate": 2.958e-06, "loss": 0.0, "num_tokens": 301834.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 18.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.6713685393333435, "kl": 0.1279423087835312, "learning_rate": 2.961e-06, "loss": 0.0061, "num_tokens": 302168.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 18.314814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 5.612220287322998, "kl": 0.10975046455860138, "learning_rate": 2.964e-06, "loss": -0.0253, "num_tokens": 302497.0, "reward": 3.75, "reward_std": 2.872281312942505, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 2.872281312942505, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 18.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0224091075360775, "kl": 0.009532616473734379, "learning_rate": 2.967e-06, "loss": 0.0005, "num_tokens": 302779.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 18.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03791513666510582, "kl": 0.013638301170431077, "learning_rate": 2.97e-06, "loss": 0.0008, "num_tokens": 303066.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 18.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07310573011636734, "kl": 0.01153097813948989, "learning_rate": 2.973e-06, "loss": 0.0006, "num_tokens": 303409.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013513513840734959, "clip_ratio/low_min": 0.013513513840734959, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 18.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.885204792022705, "kl": 0.2055332437157631, "learning_rate": 2.976e-06, "loss": -0.0392, "num_tokens": 303708.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 18.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.1347719132900238, "kl": 0.005800263257697225, "learning_rate": 2.979e-06, "loss": 0.0003, "num_tokens": 303942.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 18.425925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 4.487607002258301, "kl": 0.013947858475148678, "learning_rate": 2.982e-06, "loss": -0.1081, "num_tokens": 304266.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 18.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.02589021995663643, "kl": 0.0011689886450767517, "learning_rate": 2.9850000000000002e-06, "loss": 0.0001, "num_tokens": 304478.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 18.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.15449932217597961, "kl": 0.025850625708699226, "learning_rate": 2.988e-06, "loss": 0.0013, "num_tokens": 304777.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 18.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.11922191828489304, "kl": 0.014185238629579544, "learning_rate": 2.9910000000000002e-06, "loss": 0.0007, "num_tokens": 305116.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 18.5, "frac_reward_zero_std": 0.0, "grad_norm": 5.687262058258057, "kl": 0.07127954624593258, "learning_rate": 2.994e-06, "loss": -0.0903, "num_tokens": 305485.0, "reward": 1.875, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 1.6007810831069946, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 18.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 2.7668983936309814, "kl": 0.2996699586510658, "learning_rate": 2.9970000000000003e-06, "loss": 0.0164, "num_tokens": 305804.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 18.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.010628429241478443, "kl": 0.004377992358058691, "learning_rate": 3e-06, "loss": 0.0002, "num_tokens": 306074.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 18.555555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 1.9562952518463135, "kl": 0.006490831729024649, "learning_rate": 2.999666666666667e-06, "loss": -0.0002, "num_tokens": 306362.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 18.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.20808644592761993, "kl": 0.01005901675671339, "learning_rate": 2.9993333333333332e-06, "loss": 0.0006, "num_tokens": 306630.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 18.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.14580221474170685, "kl": 0.008147581713274121, "learning_rate": 2.999e-06, "loss": 0.0004, "num_tokens": 306906.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 18.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.13075172901153564, "kl": 0.06699612364172935, "learning_rate": 2.9986666666666668e-06, "loss": 0.0033, "num_tokens": 307174.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 18.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05798104405403137, "kl": 0.0031279143877327442, "learning_rate": 2.9983333333333336e-06, "loss": 0.0002, "num_tokens": 307436.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 18.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04234755411744118, "kl": 0.0017250796081498265, "learning_rate": 2.998e-06, "loss": 0.0001, "num_tokens": 307706.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 18.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.02651769295334816, "kl": 0.020571790635585785, "learning_rate": 2.9976666666666667e-06, "loss": 0.001, "num_tokens": 308000.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 18.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.028096335008740425, "kl": 0.09387857094407082, "learning_rate": 2.997333333333333e-06, "loss": 0.0047, "num_tokens": 308364.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.006849315017461777, "clip_ratio/high_mean": 0.006849315017461777, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006849315017461777, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 18.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 6.333775043487549, "kl": 0.03270596917718649, "learning_rate": 2.9970000000000003e-06, "loss": 0.0563, "num_tokens": 308733.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 18.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.015856772661209106, "kl": 0.0004476197063922882, "learning_rate": 2.996666666666667e-06, "loss": 0.0, "num_tokens": 308993.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 18.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.2930438816547394, "kl": 0.033293591812253, "learning_rate": 2.9963333333333334e-06, "loss": 0.0017, "num_tokens": 309282.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 18.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.213979721069336, "kl": 0.10456418618559837, "learning_rate": 2.996e-06, "loss": -0.0101, "num_tokens": 309591.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 18.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 5.028774261474609, "kl": 0.009066569808055647, "learning_rate": 2.9956666666666666e-06, "loss": -0.0029, "num_tokens": 309851.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 18.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.1365141123533249, "kl": 0.013530088821426034, "learning_rate": 2.9953333333333333e-06, "loss": 0.0006, "num_tokens": 310162.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 18.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.19325755536556244, "kl": 0.05480775982141495, "learning_rate": 2.995e-06, "loss": 0.0027, "num_tokens": 310520.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 18.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.11743105947971344, "kl": 0.017631690949201584, "learning_rate": 2.994666666666667e-06, "loss": 0.0009, "num_tokens": 310816.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 18.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04771581292152405, "kl": 0.01937988307327032, "learning_rate": 2.9943333333333333e-06, "loss": 0.001, "num_tokens": 311150.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 18.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.864377021789551, "kl": 0.03459107130765915, "learning_rate": 2.994e-06, "loss": 0.109, "num_tokens": 311462.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 18.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.010085365734994411, "kl": 0.0010973572498187423, "learning_rate": 2.993666666666667e-06, "loss": 0.0001, "num_tokens": 311682.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 18.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.054485615342855453, "kl": 0.008097039069980383, "learning_rate": 2.993333333333333e-06, "loss": 0.0004, "num_tokens": 311956.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 18.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 6.920691013336182, "kl": 0.10139943659305573, "learning_rate": 2.993e-06, "loss": 0.0205, "num_tokens": 312241.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 18.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0073060947470366955, "kl": 0.0007110536098480225, "learning_rate": 2.9926666666666668e-06, "loss": 0.0, "num_tokens": 312485.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.014705882407724857, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014705882407724857, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 18.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.789229154586792, "kl": 0.05072168447077274, "learning_rate": 2.9923333333333335e-06, "loss": -0.0561, "num_tokens": 312790.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 18.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028168451972305775, "kl": 0.003279261291027069, "learning_rate": 2.992e-06, "loss": 0.0002, "num_tokens": 313026.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 19.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.061545904725790024, "kl": 0.0036889008479192853, "learning_rate": 2.9916666666666667e-06, "loss": 0.0002, "num_tokens": 313360.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 19.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04298610985279083, "kl": 0.0021496829576790333, "learning_rate": 2.9913333333333335e-06, "loss": 0.0001, "num_tokens": 313622.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.2620873749256134, "kl": 0.036713266745209694, "learning_rate": 2.9910000000000002e-06, "loss": 0.0019, "num_tokens": 313923.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 19.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.16502098739147186, "kl": 0.05797074735164642, "learning_rate": 2.990666666666667e-06, "loss": 0.0029, "num_tokens": 314191.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 19.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.07259169965982437, "kl": 0.01000349223613739, "learning_rate": 2.9903333333333334e-06, "loss": 0.0005, "num_tokens": 314503.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.545125424861908, "kl": 0.05854676757007837, "learning_rate": 2.99e-06, "loss": 0.0028, "num_tokens": 314772.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 19.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.6622626781463623, "kl": 0.09639779478311539, "learning_rate": 2.9896666666666665e-06, "loss": -0.0059, "num_tokens": 315139.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 19.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.6983280777931213, "kl": 0.09250823222100735, "learning_rate": 2.9893333333333333e-06, "loss": 0.0045, "num_tokens": 315430.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 19.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.059819430112838745, "kl": 0.008328840602189302, "learning_rate": 2.989e-06, "loss": 0.0004, "num_tokens": 315776.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.166666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 1.860781192779541, "kl": 0.020166019443422556, "learning_rate": 2.988666666666667e-06, "loss": 0.4523, "num_tokens": 316291.0, "reward": 4.875, "reward_std": 5.25, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 5.25, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 19.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.05397823452949524, "kl": 0.005703333066776395, "learning_rate": 2.9883333333333332e-06, "loss": 0.0003, "num_tokens": 316563.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 19.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 1.9393424987792969, "kl": 0.17014753073453903, "learning_rate": 2.988e-06, "loss": -0.0586, "num_tokens": 316926.0, "reward": 4.25, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 2.1794495582580566, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.2093377113342285, "kl": 0.016108008101582527, "learning_rate": 2.987666666666667e-06, "loss": 0.0019, "num_tokens": 317255.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 19.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03911317139863968, "kl": 0.0030389464809559286, "learning_rate": 2.987333333333333e-06, "loss": 0.0002, "num_tokens": 317569.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00023904952104203403, "kl": 0.00010310113430023193, "learning_rate": 2.9870000000000004e-06, "loss": 0.0, "num_tokens": 317789.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.1512259840965271, "kl": 0.003034040331840515, "learning_rate": 2.9866666666666667e-06, "loss": 0.0002, "num_tokens": 318001.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 19.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.11819654703140259, "kl": 0.00561297032982111, "learning_rate": 2.9863333333333335e-06, "loss": 0.0003, "num_tokens": 318244.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 19.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.08559706807136536, "kl": 0.004623609886039048, "learning_rate": 2.986e-06, "loss": 0.0002, "num_tokens": 318514.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.002773053478449583, "kl": 0.0033064335584640503, "learning_rate": 2.9856666666666667e-06, "loss": 0.0002, "num_tokens": 318750.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08593981713056564, "kl": 0.023099279031157494, "learning_rate": 2.9853333333333334e-06, "loss": 0.0012, "num_tokens": 319041.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.5635592937469482, "kl": 0.0808534175157547, "learning_rate": 2.9850000000000002e-06, "loss": 0.3652, "num_tokens": 319421.0, "reward": 1.75, "reward_std": 3.947572946548462, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 3.947573184967041, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 19.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.021194322034716606, "kl": 0.0006624294037465006, "learning_rate": 2.984666666666667e-06, "loss": 0.0, "num_tokens": 319742.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 19.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.14266392588615417, "kl": 0.044841449707746506, "learning_rate": 2.9843333333333334e-06, "loss": 0.0022, "num_tokens": 320105.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 19.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.00994281005114317, "kl": 0.0006901979213580489, "learning_rate": 2.984e-06, "loss": 0.0, "num_tokens": 320325.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1049 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.01315789483487606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01315789483487606, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 5.721512317657471, "kl": 0.15218839049339294, "learning_rate": 2.9836666666666665e-06, "loss": -0.0199, "num_tokens": 320625.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 19.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.004217702895402908, "kl": 0.016147812828421593, "learning_rate": 2.9833333333333333e-06, "loss": 0.0008, "num_tokens": 320885.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.056700922548770905, "kl": 0.001943156123161316, "learning_rate": 2.983e-06, "loss": 0.0001, "num_tokens": 321104.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 19.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.6867146492004395, "kl": 0.007495361380279064, "learning_rate": 2.982666666666667e-06, "loss": 0.0176, "num_tokens": 321393.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 19.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 11.446792602539062, "kl": 0.0003892386448569596, "learning_rate": 2.982333333333333e-06, "loss": 0.0249, "num_tokens": 321690.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 19.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.05403664708137512, "kl": 0.005810579285025597, "learning_rate": 2.982e-06, "loss": 0.0003, "num_tokens": 322022.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 19.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.018789371475577354, "kl": 0.0008191429078578949, "learning_rate": 2.9816666666666668e-06, "loss": 0.0, "num_tokens": 322282.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.574074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 13.297018051147461, "kl": 0.05140158161520958, "learning_rate": 2.9813333333333336e-06, "loss": -0.1298, "num_tokens": 322513.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.59259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 4.219817638397217, "kl": 0.3111524060368538, "learning_rate": 2.9810000000000003e-06, "loss": 0.0311, "num_tokens": 322815.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 19.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.014365162700414658, "kl": 0.0004117531352676451, "learning_rate": 2.9806666666666667e-06, "loss": 0.0, "num_tokens": 323095.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.09150257706642151, "kl": 0.14653567969799042, "learning_rate": 2.9803333333333335e-06, "loss": 0.0073, "num_tokens": 323406.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08486936241388321, "kl": 0.009652079083025455, "learning_rate": 2.98e-06, "loss": 0.0005, "num_tokens": 323695.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.30423662066459656, "kl": 0.024303349666297436, "learning_rate": 2.9796666666666666e-06, "loss": 0.0013, "num_tokens": 323997.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 19.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.06946351379156113, "kl": 0.00913584278896451, "learning_rate": 2.9793333333333334e-06, "loss": 0.0005, "num_tokens": 324271.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 19.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.13158056139945984, "kl": 0.0073317347560077906, "learning_rate": 2.979e-06, "loss": 0.0004, "num_tokens": 324527.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 7.627601146697998, "kl": 0.07835894823074341, "learning_rate": 2.978666666666667e-06, "loss": 0.1095, "num_tokens": 324810.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 19.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 7.50330114364624, "kl": 0.012882797745987773, "learning_rate": 2.9783333333333333e-06, "loss": 0.2183, "num_tokens": 325086.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 81.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 81.75, "completions/mean_terminated_length": 23.666667938232422, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 19.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.9908783435821533, "kl": 0.036355357617139816, "learning_rate": 2.978e-06, "loss": 0.4558, "num_tokens": 325665.0, "reward": 2.924999952316284, "reward_std": 5.566791534423828, "rewards/reward_combined/mean": 2.924999952316284, "rewards/reward_combined/std": 5.566791534423828, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.3228541612625122, "kl": 0.10724344104528427, "learning_rate": 2.9776666666666665e-06, "loss": 0.0054, "num_tokens": 325938.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.021391578018665314, "kl": 0.009398100432008505, "learning_rate": 2.9773333333333333e-06, "loss": 0.0005, "num_tokens": 326222.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.5, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 19.814814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 3.63429594039917, "kl": 0.06384639628231525, "learning_rate": 2.977e-06, "loss": 0.22, "num_tokens": 326628.0, "reward": 2.375, "reward_std": 1.314977765083313, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.3149778842926025, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021955263800919056, "kl": 0.00012518552830442786, "learning_rate": 2.976666666666667e-06, "loss": 0.0, "num_tokens": 326936.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 19.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.8048208951950073, "kl": 0.01688295044004917, "learning_rate": 2.9763333333333336e-06, "loss": -0.0015, "num_tokens": 327349.0, "reward": 1.625, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.6007810831069946, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 19.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08759452402591705, "kl": 0.006184890866279602, "learning_rate": 2.976e-06, "loss": 0.0003, "num_tokens": 327557.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 19.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.035650063306093216, "kl": 0.002114921808242798, "learning_rate": 2.9756666666666667e-06, "loss": 0.0001, "num_tokens": 327769.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00909090880304575, "clip_ratio/low_min": 0.00909090880304575, "clip_ratio/region_mean": 0.00909090880304575, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 19.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 5.1686177253723145, "kl": 0.1576797552406788, "learning_rate": 2.9753333333333335e-06, "loss": -0.0046, "num_tokens": 328102.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 19.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 4.954868793487549, "kl": 0.01296089543029666, "learning_rate": 2.9750000000000003e-06, "loss": -0.0126, "num_tokens": 328436.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 19.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020211064256727695, "kl": 0.0009937775903381407, "learning_rate": 2.9746666666666667e-06, "loss": 0.0, "num_tokens": 328716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 19.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 6.463118553161621, "kl": 0.08824754506349564, "learning_rate": 2.9743333333333335e-06, "loss": 0.072, "num_tokens": 329051.0, "reward": 3.75, "reward_std": 2.872281312942505, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 2.872281312942505, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 19.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.703159809112549, "kl": 0.013251371681690216, "learning_rate": 2.974e-06, "loss": -0.0167, "num_tokens": 329383.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 20.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.30451029539108276, "kl": 0.02358182705938816, "learning_rate": 2.9736666666666666e-06, "loss": 0.0011, "num_tokens": 329646.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 20.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.164877414703369, "kl": 0.08919530734419823, "learning_rate": 2.9733333333333334e-06, "loss": -0.041, "num_tokens": 330003.0, "reward": 2.375, "reward_std": 1.314977765083313, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.3149778842926025, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.09259263426065445, "kl": 0.007950606057420373, "learning_rate": 2.973e-06, "loss": 0.0004, "num_tokens": 330283.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 20.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.010594749823212624, "kl": 0.00932190753519535, "learning_rate": 2.972666666666667e-06, "loss": 0.0005, "num_tokens": 330555.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.074074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.488523483276367, "kl": 0.18745465204119682, "learning_rate": 2.9723333333333333e-06, "loss": 0.0489, "num_tokens": 330862.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 20.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.003421426983550191, "kl": 0.01631779409945011, "learning_rate": 2.972e-06, "loss": 0.0008, "num_tokens": 331122.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 20.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1423378735780716, "kl": 0.013231783639639616, "learning_rate": 2.9716666666666664e-06, "loss": 0.0007, "num_tokens": 331411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 20.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.477877140045166, "kl": 0.1347007192671299, "learning_rate": 2.9713333333333337e-06, "loss": 0.0281, "num_tokens": 331761.0, "reward": 3.625, "reward_std": 2.8975563049316406, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 2.8975565433502197, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 20.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.31940189003944397, "kl": 0.02269493043422699, "learning_rate": 2.971e-06, "loss": 0.0011, "num_tokens": 332017.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.166666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 2.857755184173584, "kl": 0.02816795534454286, "learning_rate": 2.970666666666667e-06, "loss": 0.1688, "num_tokens": 332298.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 20.185185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 6.73239278793335, "kl": 0.10749359801411629, "learning_rate": 2.9703333333333336e-06, "loss": 0.011, "num_tokens": 332624.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 20.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 4.484701156616211, "kl": 0.03071892447769642, "learning_rate": 2.97e-06, "loss": 0.0626, "num_tokens": 332978.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 20.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.002247113035991788, "kl": 0.0034552812576293945, "learning_rate": 2.9696666666666667e-06, "loss": 0.0002, "num_tokens": 333214.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.007936508394777775, "clip_ratio/high_mean": 0.007936508394777775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007936508394777775, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 20.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.9538395404815674, "kl": 0.08735976181924343, "learning_rate": 2.9693333333333335e-06, "loss": 0.1933, "num_tokens": 333584.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 76.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 20.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.7457423210144043, "kl": 0.19177453219890594, "learning_rate": 2.9690000000000003e-06, "loss": 0.444, "num_tokens": 334130.0, "reward": 5.050000190734863, "reward_std": 5.900000095367432, "rewards/reward_combined/mean": 5.050000190734863, "rewards/reward_combined/std": 5.90000057220459, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 20.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 6.478559494018555, "kl": 0.03354105446487665, "learning_rate": 2.9686666666666666e-06, "loss": -0.0083, "num_tokens": 334433.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 20.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 8.359942436218262, "kl": 0.01864597573876381, "learning_rate": 2.9683333333333334e-06, "loss": 0.2653, "num_tokens": 334694.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 20.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.5284212231636047, "kl": 0.031490376219153404, "learning_rate": 2.968e-06, "loss": 0.0017, "num_tokens": 335024.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 20.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.002659357152879238, "kl": 0.0008811780717223883, "learning_rate": 2.9676666666666666e-06, "loss": 0.0, "num_tokens": 335304.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 20.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.11540081351995468, "kl": 0.004415854811668396, "learning_rate": 2.9673333333333334e-06, "loss": 0.0002, "num_tokens": 335510.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 20.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.010740313678979874, "kl": 0.0005187153728911653, "learning_rate": 2.967e-06, "loss": 0.0, "num_tokens": 335730.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.564188003540039, "kl": 0.11788275837898254, "learning_rate": 2.966666666666667e-06, "loss": 0.0346, "num_tokens": 336022.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 20.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.10571005940437317, "kl": 0.009991541504859924, "learning_rate": 2.9663333333333333e-06, "loss": 0.0005, "num_tokens": 336282.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 20.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.3314175605773926, "kl": 0.11745191365480423, "learning_rate": 2.966e-06, "loss": 0.0058, "num_tokens": 336607.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 4.558254241943359, "kl": 0.14987434446811676, "learning_rate": 2.965666666666667e-06, "loss": 0.2771, "num_tokens": 336939.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 20.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.010132589377462864, "kl": 0.000619823724264279, "learning_rate": 2.9653333333333336e-06, "loss": 0.0, "num_tokens": 337211.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 20.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.5016937255859375, "kl": 0.037582204677164555, "learning_rate": 2.965e-06, "loss": 0.0958, "num_tokens": 337490.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.5, "frac_reward_zero_std": 0.0, "grad_norm": 10.07843017578125, "kl": 0.018026244826614857, "learning_rate": 2.9646666666666668e-06, "loss": 0.1158, "num_tokens": 337792.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 20.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03571224585175514, "kl": 0.001817658543586731, "learning_rate": 2.9643333333333336e-06, "loss": 0.0001, "num_tokens": 338004.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 20.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.15281568467617035, "kl": 0.017196177504956722, "learning_rate": 2.964e-06, "loss": 0.0008, "num_tokens": 338302.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 20.555555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 1.7225083112716675, "kl": 0.1012876033782959, "learning_rate": 2.9636666666666667e-06, "loss": 0.0157, "num_tokens": 338667.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 20.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.42082706093788147, "kl": 0.031162254512310028, "learning_rate": 2.9633333333333335e-06, "loss": 0.0016, "num_tokens": 338927.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.59259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 4.228433132171631, "kl": 0.020230777096003294, "learning_rate": 2.9630000000000003e-06, "loss": -0.0263, "num_tokens": 339200.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 20.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 6.639192581176758, "kl": 0.09708906058222055, "learning_rate": 2.9626666666666666e-06, "loss": 0.0452, "num_tokens": 339495.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03252008184790611, "kl": 0.003536023898050189, "learning_rate": 2.9623333333333334e-06, "loss": 0.0002, "num_tokens": 339826.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 20.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.016074860468506813, "kl": 0.001966603100299835, "learning_rate": 2.9619999999999998e-06, "loss": 0.0001, "num_tokens": 340042.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 20.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002983392623718828, "kl": 9.645521640777588e-05, "learning_rate": 2.9616666666666665e-06, "loss": 0.0, "num_tokens": 340262.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 20.685185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 2.892916440963745, "kl": 0.005409277277067304, "learning_rate": 2.9613333333333338e-06, "loss": 0.0002, "num_tokens": 340542.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 5.1110639572143555, "kl": 0.12851876206696033, "learning_rate": 2.961e-06, "loss": 0.148, "num_tokens": 340819.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 20.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 10.510836601257324, "kl": 0.036147153936326504, "learning_rate": 2.960666666666667e-06, "loss": 0.2088, "num_tokens": 341063.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 20.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.006750933825969696, "kl": 0.0021908581256866455, "learning_rate": 2.9603333333333333e-06, "loss": 0.0001, "num_tokens": 341323.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.049142204225063324, "kl": 0.0022561585064977407, "learning_rate": 2.96e-06, "loss": 0.0001, "num_tokens": 341631.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.06688922643661499, "kl": 0.008426931453868747, "learning_rate": 2.959666666666667e-06, "loss": 0.0004, "num_tokens": 341960.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.19637010991573334, "kl": 0.015841126441955566, "learning_rate": 2.9593333333333336e-06, "loss": 0.0007, "num_tokens": 342272.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 20.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.058728717267513275, "kl": 0.00821069278754294, "learning_rate": 2.959e-06, "loss": 0.0004, "num_tokens": 342604.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 20.833333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 1.364965796470642, "kl": 0.0349162295460701, "learning_rate": 2.9586666666666667e-06, "loss": 0.0524, "num_tokens": 343035.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 20.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.671823978424072, "kl": 0.019271957222372293, "learning_rate": 2.9583333333333335e-06, "loss": 0.0892, "num_tokens": 343304.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 20.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.5664737224578857, "kl": 0.05136868730187416, "learning_rate": 2.958e-06, "loss": 0.0035, "num_tokens": 343573.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 20.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.004015078768134117, "kl": 0.00041546672582626343, "learning_rate": 2.9576666666666667e-06, "loss": 0.0, "num_tokens": 343785.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 20.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 3.7527806758880615, "kl": 0.07438867166638374, "learning_rate": 2.9573333333333335e-06, "loss": 0.0925, "num_tokens": 344137.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 8.403362274169922, "kl": 0.023505443707108498, "learning_rate": 2.9570000000000002e-06, "loss": 0.1409, "num_tokens": 344424.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 20.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.028070291504263878, "kl": 0.0003178758197464049, "learning_rate": 2.9566666666666666e-06, "loss": 0.0, "num_tokens": 344704.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 20.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.751282215118408, "kl": 0.09102287143468857, "learning_rate": 2.9563333333333334e-06, "loss": -0.0446, "num_tokens": 345032.0, "reward": 4.125, "reward_std": 2.25, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 2.25, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 20.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08037609606981277, "kl": 0.011075216345489025, "learning_rate": 2.9559999999999997e-06, "loss": 0.0006, "num_tokens": 345344.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 21.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08196492493152618, "kl": 0.005020148353651166, "learning_rate": 2.955666666666667e-06, "loss": 0.0003, "num_tokens": 345628.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 21.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.798346042633057, "kl": 0.06818924844264984, "learning_rate": 2.9553333333333337e-06, "loss": -0.009, "num_tokens": 345927.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 21.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.24431799352169037, "kl": 0.04779178276658058, "learning_rate": 2.955e-06, "loss": 0.0023, "num_tokens": 346209.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 21.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.12148759514093399, "kl": 0.01206190837547183, "learning_rate": 2.954666666666667e-06, "loss": 0.0006, "num_tokens": 346513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 21.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.1359328031539917, "kl": 0.012502173180109821, "learning_rate": 2.9543333333333332e-06, "loss": 0.0004, "num_tokens": 346811.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.25, "completions/mean_terminated_length": 2.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 21.09259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 10.664088249206543, "kl": 0.8295629173517227, "learning_rate": 2.954e-06, "loss": 0.1508, "num_tokens": 347024.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 21.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.008147433400154114, "kl": 0.0007927305996417999, "learning_rate": 2.953666666666667e-06, "loss": 0.0, "num_tokens": 347268.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 21.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023207135964185, "kl": 0.016516927629709244, "learning_rate": 2.9533333333333336e-06, "loss": 0.0008, "num_tokens": 347528.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 21.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.23874467611312866, "kl": 0.011130547791253775, "learning_rate": 2.953e-06, "loss": 0.0005, "num_tokens": 347796.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 21.166666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 6.717414855957031, "kl": 0.07371293380856514, "learning_rate": 2.9526666666666667e-06, "loss": 0.0207, "num_tokens": 348089.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 21.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.012407519854605198, "kl": 0.0006310045719146729, "learning_rate": 2.9523333333333335e-06, "loss": 0.0, "num_tokens": 348295.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 59.5, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 59.5, "completions/mean_terminated_length": 59.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 21.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 3.0559470653533936, "kl": 0.07835566624999046, "learning_rate": 2.952e-06, "loss": 0.36, "num_tokens": 348753.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 21.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.008729362860321999, "kl": 0.0004869153199251741, "learning_rate": 2.9516666666666666e-06, "loss": 0.0, "num_tokens": 349025.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 21.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02897246740758419, "kl": 0.006061029154807329, "learning_rate": 2.9513333333333334e-06, "loss": 0.0003, "num_tokens": 349313.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 21.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.3830954134464264, "kl": 0.01743863639421761, "learning_rate": 2.951e-06, "loss": 0.0009, "num_tokens": 349534.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 21.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.05649878457188606, "kl": 0.004728741245344281, "learning_rate": 2.9506666666666666e-06, "loss": 0.0002, "num_tokens": 349851.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 21.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 4.983824253082275, "kl": 0.06738797202706337, "learning_rate": 2.9503333333333333e-06, "loss": 0.2527, "num_tokens": 350175.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.040783047676086426, "kl": 0.005734084872528911, "learning_rate": 2.9499999999999997e-06, "loss": 0.0003, "num_tokens": 350445.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 21.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.10436109453439713, "kl": 0.011295586824417114, "learning_rate": 2.949666666666667e-06, "loss": 0.0006, "num_tokens": 350705.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 21.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.017738359048962593, "kl": 0.0003789237671298906, "learning_rate": 2.9493333333333337e-06, "loss": 0.0, "num_tokens": 350941.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 21.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.230639934539795, "kl": 0.026744220405817032, "learning_rate": 2.949e-06, "loss": 0.0377, "num_tokens": 351229.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 21.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 11.081344604492188, "kl": 0.36256470531225204, "learning_rate": 2.948666666666667e-06, "loss": -0.2009, "num_tokens": 351548.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 21.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.04915767163038254, "kl": 0.005341783398762345, "learning_rate": 2.948333333333333e-06, "loss": 0.0003, "num_tokens": 351885.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 21.425925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.716984748840332, "kl": 0.044003942515701056, "learning_rate": 2.948e-06, "loss": 0.0012, "num_tokens": 352314.0, "reward": 1.5499999523162842, "reward_std": 1.2556538581848145, "rewards/reward_combined/mean": 1.5499999523162842, "rewards/reward_combined/std": 1.2556538581848145, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 21.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.026875631883740425, "kl": 0.006697945529595017, "learning_rate": 2.9476666666666668e-06, "loss": 0.0003, "num_tokens": 352636.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 21.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.012115325778722763, "kl": 0.001387697469908744, "learning_rate": 2.9473333333333335e-06, "loss": 0.0001, "num_tokens": 352896.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 21.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.894453048706055, "kl": 0.020720298402011395, "learning_rate": 2.947e-06, "loss": 0.0279, "num_tokens": 353171.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 21.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.975372791290283, "kl": 0.1301591955125332, "learning_rate": 2.9466666666666667e-06, "loss": 0.1615, "num_tokens": 353496.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 21.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.19771425426006317, "kl": 0.007606446743011475, "learning_rate": 2.9463333333333335e-06, "loss": 0.0004, "num_tokens": 353756.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 21.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.2030460387468338, "kl": 0.018800528720021248, "learning_rate": 2.946e-06, "loss": 0.001, "num_tokens": 354021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 21.555555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 1.497882604598999, "kl": 0.0030176237924024463, "learning_rate": 2.945666666666667e-06, "loss": 0.0004, "num_tokens": 354281.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 21.574074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 7.182275295257568, "kl": 0.014296102803200483, "learning_rate": 2.9453333333333334e-06, "loss": 0.011, "num_tokens": 354612.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 21.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.11543595045804977, "kl": 0.011262784712016582, "learning_rate": 2.945e-06, "loss": 0.0006, "num_tokens": 354905.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 21.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 5.05928897857666, "kl": 0.1998056210577488, "learning_rate": 2.9446666666666665e-06, "loss": 0.0869, "num_tokens": 355259.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 21.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020188773050904274, "kl": 0.0010549085563980043, "learning_rate": 2.9443333333333333e-06, "loss": 0.0001, "num_tokens": 355539.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 21.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01794954389333725, "kl": 0.003737920429557562, "learning_rate": 2.944e-06, "loss": 0.0002, "num_tokens": 355795.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 21.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.08125066757202148, "kl": 0.0040408282075077295, "learning_rate": 2.943666666666667e-06, "loss": 0.0002, "num_tokens": 356109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 21.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.23854506015777588, "kl": 0.005255371332168579, "learning_rate": 2.9433333333333337e-06, "loss": 0.0003, "num_tokens": 356325.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 21.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 10.874303817749023, "kl": 0.07863211538642645, "learning_rate": 2.943e-06, "loss": 0.1966, "num_tokens": 356613.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 21.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 5.840372085571289, "kl": 0.07953836768865585, "learning_rate": 2.942666666666667e-06, "loss": -0.0632, "num_tokens": 356901.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 21.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002117764379363507, "kl": 0.00011216849088668823, "learning_rate": 2.942333333333333e-06, "loss": 0.0, "num_tokens": 357121.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 21.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.825308322906494, "kl": 0.03514570742845535, "learning_rate": 2.942e-06, "loss": -0.1674, "num_tokens": 357462.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.06371951848268509, "kl": 0.010883115697652102, "learning_rate": 2.9416666666666667e-06, "loss": 0.0005, "num_tokens": 357734.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 21.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.3431745767593384, "kl": 0.03118173498660326, "learning_rate": 2.9413333333333335e-06, "loss": 0.0015, "num_tokens": 358038.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.03796914219856262, "kl": 0.007035271963104606, "learning_rate": 2.941e-06, "loss": 0.0004, "num_tokens": 358320.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 21.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.11631365865468979, "kl": 0.012384260538965464, "learning_rate": 2.9406666666666667e-06, "loss": 0.0006, "num_tokens": 358654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 21.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.747598171234131, "kl": 0.16620274633169174, "learning_rate": 2.9403333333333334e-06, "loss": -0.1208, "num_tokens": 358979.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 21.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.3009033203125, "kl": 0.1012641042470932, "learning_rate": 2.9400000000000002e-06, "loss": -0.0134, "num_tokens": 359341.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02373761124908924, "kl": 0.008252784609794617, "learning_rate": 2.939666666666667e-06, "loss": 0.0004, "num_tokens": 359609.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 21.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.04895434156060219, "kl": 0.00742289237678051, "learning_rate": 2.9393333333333334e-06, "loss": 0.0004, "num_tokens": 359921.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 21.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.2559320628643036, "kl": 0.12430773675441742, "learning_rate": 2.939e-06, "loss": 0.0061, "num_tokens": 360252.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 21.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.002980705350637436, "kl": 0.0005115270614624023, "learning_rate": 2.9386666666666665e-06, "loss": 0.0, "num_tokens": 360464.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 21.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.00200333702377975, "kl": 0.0035089924931526184, "learning_rate": 2.9383333333333333e-06, "loss": 0.0002, "num_tokens": 360700.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.5305435061454773, "kl": 0.016145928762853146, "learning_rate": 2.938e-06, "loss": 0.0008, "num_tokens": 360996.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 22.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.9100046157836914, "kl": 0.14727820456027985, "learning_rate": 2.937666666666667e-06, "loss": 0.0545, "num_tokens": 361327.0, "reward": 3.375, "reward_std": 0.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 0.25, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 22.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.003309160703793168, "kl": 0.0005266964435577393, "learning_rate": 2.9373333333333336e-06, "loss": 0.0, "num_tokens": 361539.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 22.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.5165593028068542, "kl": 0.0408717580139637, "learning_rate": 2.937e-06, "loss": 0.002, "num_tokens": 361829.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 22.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.04083992540836334, "kl": 0.0015979359450284392, "learning_rate": 2.9366666666666668e-06, "loss": 0.0001, "num_tokens": 362141.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 22.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.18477961421012878, "kl": 0.00941312313079834, "learning_rate": 2.936333333333333e-06, "loss": 0.0005, "num_tokens": 362357.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 22.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.2500711679458618, "kl": 0.030659684911370277, "learning_rate": 2.936e-06, "loss": 0.0015, "num_tokens": 362686.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 22.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.044258117675781, "kl": 0.00922396220266819, "learning_rate": 2.9356666666666667e-06, "loss": 0.0593, "num_tokens": 363002.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 22.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07881251722574234, "kl": 0.00452762097120285, "learning_rate": 2.9353333333333335e-06, "loss": 0.0002, "num_tokens": 363262.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 22.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.001921364339068532, "kl": 0.016606775112450123, "learning_rate": 2.9350000000000003e-06, "loss": 0.0008, "num_tokens": 363522.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 22.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.034760188311338425, "kl": 0.0016343023162335157, "learning_rate": 2.9346666666666666e-06, "loss": 0.0001, "num_tokens": 363800.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.6671550869941711, "kl": 0.060452768579125404, "learning_rate": 2.9343333333333334e-06, "loss": 0.0038, "num_tokens": 364077.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 22.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.49239590764045715, "kl": 0.028446731623262167, "learning_rate": 2.934e-06, "loss": 0.0019, "num_tokens": 364348.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 22.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.09381739050149918, "kl": 0.006515514440252446, "learning_rate": 2.933666666666667e-06, "loss": 0.0003, "num_tokens": 364583.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 22.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.08807060867547989, "kl": 0.004534887499175966, "learning_rate": 2.9333333333333333e-06, "loss": 0.0002, "num_tokens": 364879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 22.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.19829215109348297, "kl": 0.03475787350907922, "learning_rate": 2.933e-06, "loss": 0.0017, "num_tokens": 365155.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 22.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021598278544843197, "kl": 0.0034804120659828186, "learning_rate": 2.9326666666666665e-06, "loss": 0.0002, "num_tokens": 365391.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 22.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.093035988509655, "kl": 0.043035659939050674, "learning_rate": 2.9323333333333333e-06, "loss": 0.0021, "num_tokens": 365694.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 22.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.03102559968829155, "kl": 0.007453024387359619, "learning_rate": 2.932e-06, "loss": 0.0004, "num_tokens": 365906.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 22.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.06748061627149582, "kl": 0.008585278643295169, "learning_rate": 2.931666666666667e-06, "loss": 0.0004, "num_tokens": 366255.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 22.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06693046540021896, "kl": 0.13564538955688477, "learning_rate": 2.9313333333333336e-06, "loss": 0.0066, "num_tokens": 366571.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 22.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.16144220530986786, "kl": 0.05471273045986891, "learning_rate": 2.931e-06, "loss": 0.0027, "num_tokens": 366863.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 22.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.1028034687042236, "kl": 0.18381104990839958, "learning_rate": 2.9306666666666668e-06, "loss": 0.2272, "num_tokens": 367212.0, "reward": 3.25, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 0.28867512941360474, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 22.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.00020064125419594347, "kl": 0.00011434406042098999, "learning_rate": 2.930333333333333e-06, "loss": 0.0, "num_tokens": 367432.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 22.425925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.1972577571868896, "kl": 0.12325546145439148, "learning_rate": 2.9300000000000003e-06, "loss": -0.003, "num_tokens": 367795.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 22.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 4.3981404304504395, "kl": 0.008597993873991072, "learning_rate": 2.9296666666666667e-06, "loss": -0.0112, "num_tokens": 368129.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 22.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.07383531332015991, "kl": 0.04913156945258379, "learning_rate": 2.9293333333333335e-06, "loss": 0.0023, "num_tokens": 368489.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 22.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.8061723709106445, "kl": 0.017924664542078972, "learning_rate": 2.9290000000000002e-06, "loss": 0.0294, "num_tokens": 368790.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 22.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016739238053560257, "kl": 0.0011711865663528442, "learning_rate": 2.9286666666666666e-06, "loss": 0.0001, "num_tokens": 369070.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 22.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.005845101550221443, "kl": 0.017530377954244614, "learning_rate": 2.9283333333333334e-06, "loss": 0.0009, "num_tokens": 369354.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 22.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.2753112018108368, "kl": 0.04153232462704182, "learning_rate": 2.928e-06, "loss": 0.0021, "num_tokens": 369687.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 22.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.05116073042154312, "kl": 0.002799829700961709, "learning_rate": 2.927666666666667e-06, "loss": 0.0001, "num_tokens": 369947.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.14668123424053192, "kl": 0.01994048012420535, "learning_rate": 2.9273333333333333e-06, "loss": 0.0011, "num_tokens": 370229.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.10535865277051926, "kl": 0.014556014444679022, "learning_rate": 2.927e-06, "loss": 0.0007, "num_tokens": 370502.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 22.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.075160026550293, "kl": 0.023662267718464136, "learning_rate": 2.9266666666666665e-06, "loss": 0.14, "num_tokens": 370849.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 22.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.379148960113525, "kl": 0.07176013104617596, "learning_rate": 2.9263333333333332e-06, "loss": 0.007, "num_tokens": 371141.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 22.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.767480850219727, "kl": 0.3034612610936165, "learning_rate": 2.926e-06, "loss": 0.0001, "num_tokens": 371467.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 22.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 4.768539905548096, "kl": 0.0630022007972002, "learning_rate": 2.925666666666667e-06, "loss": -0.0181, "num_tokens": 371811.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 22.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 1.3131734132766724, "kl": 0.1380967851728201, "learning_rate": 2.9253333333333336e-06, "loss": 0.0074, "num_tokens": 372085.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1225 }, { "clip_ratio/high_max": 0.008620689623057842, "clip_ratio/high_mean": 0.008620689623057842, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008620689623057842, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 22.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 1.9731311798095703, "kl": 0.04745516739785671, "learning_rate": 2.925e-06, "loss": -0.006, "num_tokens": 372431.0, "reward": 5.5, "reward_std": 2.6140644550323486, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 2.6140644550323486, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 22.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.008209033869206905, "kl": 0.0010252483189105988, "learning_rate": 2.9246666666666667e-06, "loss": 0.0001, "num_tokens": 372675.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 22.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005959033966064453, "kl": 0.002459391951560974, "learning_rate": 2.9243333333333335e-06, "loss": 0.0001, "num_tokens": 372935.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 22.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.888219833374023, "kl": 0.012552839703857899, "learning_rate": 2.9240000000000003e-06, "loss": -0.033, "num_tokens": 373225.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 22.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 1.72470223903656, "kl": 0.16640783549519256, "learning_rate": 2.9236666666666667e-06, "loss": 0.0074, "num_tokens": 373444.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 22.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.34498071670532227, "kl": 0.07076587900519371, "learning_rate": 2.9233333333333334e-06, "loss": 0.0035, "num_tokens": 373719.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 22.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.360834002494812, "kl": 0.030715636909008026, "learning_rate": 2.9230000000000002e-06, "loss": 0.0015, "num_tokens": 373991.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 22.833333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 6.977489471435547, "kl": 0.10888610035181046, "learning_rate": 2.9226666666666666e-06, "loss": 0.0273, "num_tokens": 374328.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 22.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.054354228079319, "kl": 0.0016398903680965304, "learning_rate": 2.9223333333333334e-06, "loss": 0.0001, "num_tokens": 374634.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 22.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07077593356370926, "kl": 0.03445947263389826, "learning_rate": 2.922e-06, "loss": 0.0017, "num_tokens": 374946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 22.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07003846764564514, "kl": 0.04414568841457367, "learning_rate": 2.921666666666667e-06, "loss": 0.0021, "num_tokens": 375360.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.17434364557266235, "kl": 0.011220123968087137, "learning_rate": 2.9213333333333333e-06, "loss": 0.0006, "num_tokens": 375658.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.008249103091657162, "kl": 0.0030403323471546173, "learning_rate": 2.921e-06, "loss": 0.0001, "num_tokens": 375930.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 22.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.8902098536491394, "kl": 0.04571262560784817, "learning_rate": 2.9206666666666664e-06, "loss": 0.0023, "num_tokens": 376186.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 22.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.10242968797683716, "kl": 0.009457210544496775, "learning_rate": 2.9203333333333332e-06, "loss": 0.0005, "num_tokens": 376452.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 22.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.7022817730903625, "kl": 0.1087116226553917, "learning_rate": 2.9200000000000004e-06, "loss": 0.0051, "num_tokens": 376774.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 23.0, "frac_reward_zero_std": 0.0, "grad_norm": 35.80710983276367, "kl": 0.08410745114088058, "learning_rate": 2.9196666666666668e-06, "loss": 0.2291, "num_tokens": 376983.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 23.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09875809401273727, "kl": 0.0028782979061361402, "learning_rate": 2.9193333333333336e-06, "loss": 0.0001, "num_tokens": 377239.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 23.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.050571467727422714, "kl": 0.0026585019659250975, "learning_rate": 2.919e-06, "loss": 0.0002, "num_tokens": 377482.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 23.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.039863619953393936, "kl": 0.0019241442787460983, "learning_rate": 2.9186666666666667e-06, "loss": 0.0001, "num_tokens": 377790.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 23.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.08739642798900604, "kl": 0.009353132452815771, "learning_rate": 2.9183333333333335e-06, "loss": 0.0005, "num_tokens": 378092.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 23.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.10864747315645218, "kl": 0.003104984760284424, "learning_rate": 2.9180000000000003e-06, "loss": 0.0002, "num_tokens": 378308.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 23.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06950744986534119, "kl": 0.010574434418231249, "learning_rate": 2.9176666666666666e-06, "loss": 0.0006, "num_tokens": 378643.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 23.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.0016770362854, "kl": 0.0061819166876375675, "learning_rate": 2.9173333333333334e-06, "loss": 0.3682, "num_tokens": 378947.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01288084127008915, "kl": 0.005341643700376153, "learning_rate": 2.917e-06, "loss": 0.0003, "num_tokens": 379215.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 23.166666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 3.228269577026367, "kl": 0.031007222831249237, "learning_rate": 2.9166666666666666e-06, "loss": 0.0062, "num_tokens": 379479.0, "reward": 1.25, "reward_std": 1.5, "rewards/reward_combined/mean": 1.25, "rewards/reward_combined/std": 1.5, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 23.185185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 4.015903472900391, "kl": 0.00810985779389739, "learning_rate": 2.9163333333333333e-06, "loss": 0.1962, "num_tokens": 379812.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 23.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.00765201635658741, "kl": 0.0014866248238831758, "learning_rate": 2.916e-06, "loss": 0.0001, "num_tokens": 380072.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 23.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.11550978571176529, "kl": 0.07717056572437286, "learning_rate": 2.915666666666667e-06, "loss": 0.0039, "num_tokens": 380434.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 23.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.3842141032218933, "kl": 0.056036993861198425, "learning_rate": 2.9153333333333333e-06, "loss": 0.0027, "num_tokens": 380728.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 23.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.3481963276863098, "kl": 0.0385284349322319, "learning_rate": 2.915e-06, "loss": 0.0023, "num_tokens": 381050.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 23.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.2472708374261856, "kl": 0.01906286645680666, "learning_rate": 2.9146666666666664e-06, "loss": 0.0009, "num_tokens": 381273.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 23.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.705979585647583, "kl": 0.043598782271146774, "learning_rate": 2.9143333333333336e-06, "loss": 0.0022, "num_tokens": 381517.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 23.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.003512790659442544, "kl": 0.0031919777393341064, "learning_rate": 2.9140000000000004e-06, "loss": 0.0002, "num_tokens": 381753.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 23.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.018005406484007835, "kl": 0.0005531683564186096, "learning_rate": 2.9136666666666668e-06, "loss": 0.0, "num_tokens": 381965.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.750605821609497, "kl": 0.009492204524576664, "learning_rate": 2.9133333333333335e-06, "loss": 0.0006, "num_tokens": 382237.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 23.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.006020226515829563, "kl": 0.01743131224066019, "learning_rate": 2.913e-06, "loss": 0.0009, "num_tokens": 382521.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 23.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05591482296586037, "kl": 0.0333207193762064, "learning_rate": 2.9126666666666667e-06, "loss": 0.0016, "num_tokens": 382940.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.20767620205879211, "kl": 0.00659352820366621, "learning_rate": 2.9123333333333335e-06, "loss": 0.0003, "num_tokens": 383236.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 23.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.23847971856594086, "kl": 0.08137239515781403, "learning_rate": 2.9120000000000002e-06, "loss": 0.0041, "num_tokens": 383556.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 23.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.36015400290489197, "kl": 0.023563608527183533, "learning_rate": 2.9116666666666666e-06, "loss": 0.0012, "num_tokens": 383764.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 23.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.18822342157363892, "kl": 0.022435004822909832, "learning_rate": 2.9113333333333334e-06, "loss": 0.0011, "num_tokens": 384048.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 23.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.054273031651973724, "kl": 0.004214713117107749, "learning_rate": 2.911e-06, "loss": 0.0002, "num_tokens": 384371.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 23.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.523254871368408, "kl": 0.022931042592972517, "learning_rate": 2.9106666666666665e-06, "loss": -0.0228, "num_tokens": 384699.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 23.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0046770935878157616, "kl": 0.016160299070179462, "learning_rate": 2.9103333333333333e-06, "loss": 0.0008, "num_tokens": 384959.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 23.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.27035823464393616, "kl": 0.04531625285744667, "learning_rate": 2.91e-06, "loss": 0.0024, "num_tokens": 385248.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.33856290578842163, "kl": 0.03045171545818448, "learning_rate": 2.909666666666667e-06, "loss": 0.0018, "num_tokens": 385522.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 23.574074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.5626940727233887, "kl": 0.02123592747375369, "learning_rate": 2.9093333333333332e-06, "loss": 0.1222, "num_tokens": 385905.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 23.59259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 7.170790672302246, "kl": 0.2881488502025604, "learning_rate": 2.909e-06, "loss": 0.0368, "num_tokens": 386208.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 23.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.0086846351623535, "kl": 0.015922888182103634, "learning_rate": 2.9086666666666664e-06, "loss": 0.1219, "num_tokens": 386549.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 23.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.6355563402175903, "kl": 0.1421504244208336, "learning_rate": 2.9083333333333336e-06, "loss": 0.007, "num_tokens": 386888.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 23.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.170419216156006, "kl": 0.029172319918870926, "learning_rate": 2.9080000000000004e-06, "loss": -0.0429, "num_tokens": 387234.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 23.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 4.438594341278076, "kl": 0.07794092409312725, "learning_rate": 2.9076666666666667e-06, "loss": -0.0425, "num_tokens": 387585.0, "reward": 3.75, "reward_std": 2.723355770111084, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 2.723355770111084, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 23.685185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 3.892442464828491, "kl": 0.0718070799484849, "learning_rate": 2.9073333333333335e-06, "loss": 0.1846, "num_tokens": 387892.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 23.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 4.676334381103516, "kl": 0.06440005823969841, "learning_rate": 2.907e-06, "loss": 0.1175, "num_tokens": 388252.0, "reward": 2.375, "reward_std": 2.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 2.25, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 23.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.10585566610097885, "kl": 0.015857995487749577, "learning_rate": 2.9066666666666666e-06, "loss": 0.0008, "num_tokens": 388541.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 23.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.429829120635986, "kl": 0.1935255378484726, "learning_rate": 2.9063333333333334e-06, "loss": 0.0604, "num_tokens": 388841.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 23.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001566532882861793, "kl": 0.00011816620826721191, "learning_rate": 2.9060000000000002e-06, "loss": 0.0, "num_tokens": 389061.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 23.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020859844516962767, "kl": 0.0010903222137130797, "learning_rate": 2.9056666666666666e-06, "loss": 0.0001, "num_tokens": 389341.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 23.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.012579447589814663, "kl": 0.0007582803373225033, "learning_rate": 2.9053333333333334e-06, "loss": 0.0, "num_tokens": 389613.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 23.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.15366342663764954, "kl": 0.054549604654312134, "learning_rate": 2.905e-06, "loss": 0.0028, "num_tokens": 389922.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 23.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.03424161672592163, "kl": 0.0021857281972188503, "learning_rate": 2.9046666666666665e-06, "loss": 0.0001, "num_tokens": 390233.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 23.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.632739543914795, "kl": 0.06287204567342997, "learning_rate": 2.9043333333333337e-06, "loss": 0.2307, "num_tokens": 390643.0, "reward": 4.375, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 3.902456521987915, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 23.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02854452282190323, "kl": 0.010224854573607445, "learning_rate": 2.904e-06, "loss": 0.0005, "num_tokens": 390945.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 23.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07448150217533112, "kl": 0.0120579544454813, "learning_rate": 2.903666666666667e-06, "loss": 0.0006, "num_tokens": 391288.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 23.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 3.4822940826416016, "kl": 0.1271328292787075, "learning_rate": 2.903333333333333e-06, "loss": -0.0378, "num_tokens": 391586.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 23.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05746985971927643, "kl": 0.15698464959859848, "learning_rate": 2.903e-06, "loss": 0.0078, "num_tokens": 391894.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.944444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 3.1779134273529053, "kl": 0.010984099702909589, "learning_rate": 2.9026666666666668e-06, "loss": 0.0229, "num_tokens": 392153.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 23.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.1667739897966385, "kl": 0.011745782569050789, "learning_rate": 2.9023333333333336e-06, "loss": 0.0006, "num_tokens": 392386.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 23.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.017675377428531647, "kl": 0.006357002770528197, "learning_rate": 2.9020000000000003e-06, "loss": 0.0003, "num_tokens": 392658.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 24.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01812010630965233, "kl": 0.003440248081460595, "learning_rate": 2.9016666666666667e-06, "loss": 0.0002, "num_tokens": 392938.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07860206067562103, "kl": 0.009696295484900475, "learning_rate": 2.9013333333333335e-06, "loss": 0.0005, "num_tokens": 393222.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 24.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03346037492156029, "kl": 0.004525701981037855, "learning_rate": 2.901e-06, "loss": 0.0002, "num_tokens": 393494.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 24.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.7806396484375, "kl": 0.0825969516299665, "learning_rate": 2.9006666666666666e-06, "loss": 0.0044, "num_tokens": 393785.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 24.074074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 9.100828170776367, "kl": 0.035566192818805575, "learning_rate": 2.9003333333333334e-06, "loss": -0.0454, "num_tokens": 394051.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 24.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.0176968015730381, "kl": 0.0011995251406915486, "learning_rate": 2.9e-06, "loss": 0.0001, "num_tokens": 394353.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 24.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 6.3940958976745605, "kl": 0.016883139964193106, "learning_rate": 2.8996666666666665e-06, "loss": -0.0027, "num_tokens": 394687.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 24.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.309798002243042, "kl": 0.10009177401661873, "learning_rate": 2.8993333333333333e-06, "loss": 0.0306, "num_tokens": 395091.0, "reward": 1.125, "reward_std": 1.25, "rewards/reward_combined/mean": 1.125, "rewards/reward_combined/std": 1.25, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 24.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10385389626026154, "kl": 0.004034673795104027, "learning_rate": 2.899e-06, "loss": 0.0002, "num_tokens": 395324.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 24.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.1049782857298851, "kl": 0.002744565485045314, "learning_rate": 2.898666666666667e-06, "loss": 0.0001, "num_tokens": 395540.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 24.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.22583812475204468, "kl": 0.035681961104273796, "learning_rate": 2.8983333333333337e-06, "loss": 0.0019, "num_tokens": 395871.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 24.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.002493814332410693, "kl": 0.0010850706021301448, "learning_rate": 2.898e-06, "loss": 0.0001, "num_tokens": 396151.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 24.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.346193313598633, "kl": 0.09355531260371208, "learning_rate": 2.897666666666667e-06, "loss": 0.0463, "num_tokens": 396488.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 24.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00018391605408396572, "kl": 0.00011413544416427612, "learning_rate": 2.897333333333333e-06, "loss": 0.0, "num_tokens": 396708.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 24.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.030608419328927994, "kl": 0.0014388965792022645, "learning_rate": 2.897e-06, "loss": 0.0001, "num_tokens": 397024.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 24.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.261711448431015, "kl": 0.019011598080396652, "learning_rate": 2.8966666666666667e-06, "loss": 0.001, "num_tokens": 397284.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 9.060579299926758, "kl": 0.05696502886712551, "learning_rate": 2.8963333333333335e-06, "loss": 0.0357, "num_tokens": 397560.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 24.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.04632307216525078, "kl": 0.007019456010311842, "learning_rate": 2.8960000000000003e-06, "loss": 0.0004, "num_tokens": 397898.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.333333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 5.684145927429199, "kl": 0.019601126201450825, "learning_rate": 2.8956666666666667e-06, "loss": 0.218, "num_tokens": 398204.0, "reward": 4.625, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 4.308422088623047, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 24.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.6183109283447266, "kl": 0.07152040116488934, "learning_rate": 2.8953333333333335e-06, "loss": 0.0009, "num_tokens": 398588.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 24.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 10.739463806152344, "kl": 0.017192358151078224, "learning_rate": 2.895e-06, "loss": 0.1828, "num_tokens": 398841.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 24.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006905599497258663, "kl": 0.015825394541025162, "learning_rate": 2.8946666666666666e-06, "loss": 0.0008, "num_tokens": 399101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 24.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.03127503767609596, "kl": 0.00217806757427752, "learning_rate": 2.8943333333333334e-06, "loss": 0.0001, "num_tokens": 399362.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 24.425925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 4.607936382293701, "kl": 0.058576663956046104, "learning_rate": 2.894e-06, "loss": 0.1897, "num_tokens": 399682.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 24.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.03317941352725029, "kl": 0.004169125575572252, "learning_rate": 2.893666666666667e-06, "loss": 0.0002, "num_tokens": 399958.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 24.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.015253948979079723, "kl": 0.08897051960229874, "learning_rate": 2.8933333333333333e-06, "loss": 0.0044, "num_tokens": 400324.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 24.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.2393220216035843, "kl": 0.04741102457046509, "learning_rate": 2.893e-06, "loss": 0.0026, "num_tokens": 400691.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.4361139237880707, "kl": 0.06529728788882494, "learning_rate": 2.892666666666667e-06, "loss": 0.0028, "num_tokens": 400989.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 24.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07573315501213074, "kl": 0.012549018487334251, "learning_rate": 2.8923333333333336e-06, "loss": 0.0006, "num_tokens": 401325.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 24.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.003119494765996933, "kl": 0.00016990544827422127, "learning_rate": 2.892e-06, "loss": 0.0, "num_tokens": 401632.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 24.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.1930025815963745, "kl": 0.0218666922301054, "learning_rate": 2.891666666666667e-06, "loss": 0.0011, "num_tokens": 401888.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 24.574074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 4.869344234466553, "kl": 0.18673317320644855, "learning_rate": 2.891333333333333e-06, "loss": 0.2823, "num_tokens": 402241.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 24.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.03775437921285629, "kl": 0.000459328293800354, "learning_rate": 2.891e-06, "loss": 0.0, "num_tokens": 402453.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 24.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.794774055480957, "kl": 0.08449650928378105, "learning_rate": 2.8906666666666667e-06, "loss": -0.0032, "num_tokens": 402803.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 24.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.005466298200190067, "kl": 0.0009227111877407879, "learning_rate": 2.8903333333333335e-06, "loss": 0.0, "num_tokens": 403063.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 24.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044248453341424465, "kl": 9.804964065551758e-05, "learning_rate": 2.8900000000000003e-06, "loss": 0.0, "num_tokens": 403283.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 24.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.004193097818642855, "kl": 0.0031790658831596375, "learning_rate": 2.8896666666666666e-06, "loss": 0.0002, "num_tokens": 403519.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 24.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.03349853679537773, "kl": 0.004807890392839909, "learning_rate": 2.8893333333333334e-06, "loss": 0.0002, "num_tokens": 403831.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 24.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.08382602035999298, "kl": 0.03187142685055733, "learning_rate": 2.8889999999999998e-06, "loss": 0.0016, "num_tokens": 404127.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 24.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.491166591644287, "kl": 0.06389089673757553, "learning_rate": 2.888666666666667e-06, "loss": -0.0448, "num_tokens": 404474.0, "reward": 1.5, "reward_std": 2.2730302810668945, "rewards/reward_combined/mean": 1.5, "rewards/reward_combined/std": 2.2730302810668945, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 24.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.21916526556015015, "kl": 0.0208267355337739, "learning_rate": 2.8883333333333333e-06, "loss": 0.001, "num_tokens": 404737.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 24.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.18290232121944427, "kl": 0.013282960280776024, "learning_rate": 2.888e-06, "loss": 0.0007, "num_tokens": 405063.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 24.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.6394146680831909, "kl": 0.06432866025716066, "learning_rate": 2.887666666666667e-06, "loss": 0.0043, "num_tokens": 405340.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 24.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.16449198126792908, "kl": 0.0064424017909914255, "learning_rate": 2.8873333333333333e-06, "loss": 0.0003, "num_tokens": 405567.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 24.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.41378939151763916, "kl": 0.05045641399919987, "learning_rate": 2.887e-06, "loss": 0.0025, "num_tokens": 405858.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.04150913283228874, "kl": 0.008626286871731281, "learning_rate": 2.886666666666667e-06, "loss": 0.0004, "num_tokens": 406132.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 24.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.189365863800049, "kl": 0.15324417501688004, "learning_rate": 2.8863333333333336e-06, "loss": -0.0299, "num_tokens": 406463.0, "reward": 2.875, "reward_std": 3.8810436725616455, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.8810436725616455, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11873263120651245, "kl": 0.012811253778636456, "learning_rate": 2.886e-06, "loss": 0.0007, "num_tokens": 406751.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 24.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02632307820022106, "kl": 0.005434486083686352, "learning_rate": 2.8856666666666668e-06, "loss": 0.0003, "num_tokens": 407021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 24.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.027666503563523293, "kl": 0.0009456351399421692, "learning_rate": 2.885333333333333e-06, "loss": 0.0, "num_tokens": 407229.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 24.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04538443684577942, "kl": 0.008396757300943136, "learning_rate": 2.885e-06, "loss": 0.0004, "num_tokens": 407517.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 24.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.02309548854827881, "kl": 0.0010463881844771095, "learning_rate": 2.8846666666666667e-06, "loss": 0.0001, "num_tokens": 407831.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008771929889917374, "clip_ratio/low_min": 0.008771929889917374, "clip_ratio/region_mean": 0.008771929889917374, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 24.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.706453800201416, "kl": 0.07297445461153984, "learning_rate": 2.8843333333333335e-06, "loss": 0.0943, "num_tokens": 408167.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 24.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04379303380846977, "kl": 0.004015351412817836, "learning_rate": 2.8840000000000003e-06, "loss": 0.0002, "num_tokens": 408439.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.0, "frac_reward_zero_std": 0.0, "grad_norm": 7.96112060546875, "kl": 0.0038274761172942817, "learning_rate": 2.8836666666666666e-06, "loss": 0.0808, "num_tokens": 408741.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 25.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.13623046875, "kl": 0.01971262990264222, "learning_rate": 2.8833333333333334e-06, "loss": 0.2748, "num_tokens": 409021.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 25.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038380788173526525, "kl": 0.00040895864367485046, "learning_rate": 2.883e-06, "loss": 0.0, "num_tokens": 409265.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 25.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.10849466174840927, "kl": 0.013719635549932718, "learning_rate": 2.882666666666667e-06, "loss": 0.0006, "num_tokens": 409563.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 25.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.004174836911261082, "kl": 0.016189759597182274, "learning_rate": 2.8823333333333333e-06, "loss": 0.0008, "num_tokens": 409823.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 25.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.017169617116451263, "kl": 0.0010407405789010227, "learning_rate": 2.882e-06, "loss": 0.0001, "num_tokens": 410085.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.003414747305214405, "kl": 0.0009467572235735133, "learning_rate": 2.881666666666667e-06, "loss": 0.0, "num_tokens": 410304.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 25.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.9151815176010132, "kl": 0.1177082397043705, "learning_rate": 2.8813333333333332e-06, "loss": 0.0059, "num_tokens": 410608.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 25.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.594345569610596, "kl": 0.03394792787730694, "learning_rate": 2.881e-06, "loss": 0.0302, "num_tokens": 410901.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 25.166666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 4.6811065673828125, "kl": 0.03273935429751873, "learning_rate": 2.880666666666667e-06, "loss": 0.1046, "num_tokens": 411205.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 25.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.056551720947027206, "kl": 0.0023839278146624565, "learning_rate": 2.8803333333333336e-06, "loss": 0.0001, "num_tokens": 411512.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.004546754993498325, "kl": 0.0031198635697364807, "learning_rate": 2.88e-06, "loss": 0.0002, "num_tokens": 411748.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 25.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.902839183807373, "kl": 0.04041749658063054, "learning_rate": 2.8796666666666667e-06, "loss": 0.1691, "num_tokens": 412098.0, "reward": 7.375, "reward_std": 0.25, "rewards/reward_combined/mean": 7.375, "rewards/reward_combined/std": 0.25, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 25.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 8.713037490844727, "kl": 0.015319585800170898, "learning_rate": 2.879333333333333e-06, "loss": 0.102, "num_tokens": 412362.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.16719676554203033, "kl": 0.002106286585330963, "learning_rate": 2.879e-06, "loss": 0.0001, "num_tokens": 412574.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 25.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.3861662745475769, "kl": 0.08083298802375793, "learning_rate": 2.878666666666667e-06, "loss": 0.0041, "num_tokens": 412933.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 25.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.3336963653564453, "kl": 0.10030456259846687, "learning_rate": 2.8783333333333334e-06, "loss": 0.005, "num_tokens": 413277.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 25.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.10147012025117874, "kl": 0.027383566834032536, "learning_rate": 2.8780000000000002e-06, "loss": 0.0014, "num_tokens": 413569.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 25.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.4869394302368164, "kl": 0.07689347770065069, "learning_rate": 2.8776666666666666e-06, "loss": 0.0031, "num_tokens": 413892.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 25.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10416198521852493, "kl": 0.0032396416645497084, "learning_rate": 2.8773333333333334e-06, "loss": 0.0002, "num_tokens": 414108.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 25.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.13489079475402832, "kl": 0.008247362682595849, "learning_rate": 2.877e-06, "loss": 0.0004, "num_tokens": 414374.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 25.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 10.903152465820312, "kl": 0.02432501211296767, "learning_rate": 2.876666666666667e-06, "loss": 0.1839, "num_tokens": 414645.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 1.0227950811386108, "kl": 0.09256695955991745, "learning_rate": 2.8763333333333333e-06, "loss": 0.005, "num_tokens": 414925.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 25.425925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 7.019451141357422, "kl": 0.08019421622157097, "learning_rate": 2.876e-06, "loss": -0.0647, "num_tokens": 415204.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1373 }, { "clip_ratio/high_max": 0.009433962404727936, "clip_ratio/high_mean": 0.009433962404727936, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009433962404727936, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 25.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 6.732794761657715, "kl": 0.06344223394989967, "learning_rate": 2.875666666666667e-06, "loss": 0.0736, "num_tokens": 415518.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 25.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.1445167064666748, "kl": 0.027428697794675827, "learning_rate": 2.8753333333333332e-06, "loss": 0.0017, "num_tokens": 415827.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 25.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.665595054626465, "kl": 0.012714105658233166, "learning_rate": 2.875e-06, "loss": -0.015, "num_tokens": 416112.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 25.5, "frac_reward_zero_std": 0.0, "grad_norm": 5.245586395263672, "kl": 0.011372138047590852, "learning_rate": 2.8746666666666668e-06, "loss": 0.08, "num_tokens": 416438.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 25.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.0409581661224365, "kl": 0.0564742386341095, "learning_rate": 2.8743333333333336e-06, "loss": -0.0114, "num_tokens": 416739.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 25.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 1.3304027318954468, "kl": 0.1141563281416893, "learning_rate": 2.874e-06, "loss": 0.006, "num_tokens": 417066.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 25.555555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 5.342610836029053, "kl": 0.09788231551647186, "learning_rate": 2.8736666666666667e-06, "loss": 0.05, "num_tokens": 417381.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 25.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.11588838696479797, "kl": 0.00980787631124258, "learning_rate": 2.873333333333333e-06, "loss": 0.0005, "num_tokens": 417671.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 25.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.022172139957547188, "kl": 0.0008865445852279663, "learning_rate": 2.8730000000000003e-06, "loss": 0.0, "num_tokens": 417881.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06228451803326607, "kl": 0.005677401786670089, "learning_rate": 2.872666666666667e-06, "loss": 0.0003, "num_tokens": 418192.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 25.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.01638312265276909, "kl": 0.0003984219874837436, "learning_rate": 2.8723333333333334e-06, "loss": 0.0, "num_tokens": 418500.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 25.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 7.236784934997559, "kl": 0.06469046536949463, "learning_rate": 2.872e-06, "loss": 0.1034, "num_tokens": 418810.0, "reward": 5.5, "reward_std": 4.6726155281066895, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 4.6726155281066895, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 25.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.045878250151872635, "kl": 0.00161789043340832, "learning_rate": 2.8716666666666666e-06, "loss": 0.0001, "num_tokens": 419088.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 72.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 72.5, "completions/mean_terminated_length": 11.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 25.685185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 3.2424540519714355, "kl": 0.008048801682889462, "learning_rate": 2.8713333333333333e-06, "loss": 0.4699, "num_tokens": 419598.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 25.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 1.9747731685638428, "kl": 0.0548938550055027, "learning_rate": 2.871e-06, "loss": -0.0055, "num_tokens": 420002.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.10652562975883484, "kl": 0.012743937084451318, "learning_rate": 2.870666666666667e-06, "loss": 0.0006, "num_tokens": 420282.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 25.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.012463857419788837, "kl": 0.0007106041011866182, "learning_rate": 2.8703333333333333e-06, "loss": 0.0, "num_tokens": 420554.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 25.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07210277020931244, "kl": 0.009725909680128098, "learning_rate": 2.87e-06, "loss": 0.0005, "num_tokens": 420897.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 25.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.02645871229469776, "kl": 0.0010656331141944975, "learning_rate": 2.869666666666667e-06, "loss": 0.0001, "num_tokens": 421132.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 25.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.018604299053549767, "kl": 0.09517047926783562, "learning_rate": 2.869333333333333e-06, "loss": 0.0048, "num_tokens": 421496.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.39529165625572205, "kl": 0.025301030604168773, "learning_rate": 2.869e-06, "loss": 0.0014, "num_tokens": 421794.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.017190895974636078, "kl": 0.0019027739763259888, "learning_rate": 2.8686666666666668e-06, "loss": 0.0001, "num_tokens": 422010.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 25.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06255856901407242, "kl": 0.008704939857125282, "learning_rate": 2.8683333333333335e-06, "loss": 0.0004, "num_tokens": 422294.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 25.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.2053355574607849, "kl": 0.025588180869817734, "learning_rate": 2.868e-06, "loss": 0.0014, "num_tokens": 422637.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.007869244553148746, "kl": 0.0044186601880937815, "learning_rate": 2.8676666666666667e-06, "loss": 0.0002, "num_tokens": 422905.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 25.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.022038500756025314, "kl": 0.0031958511099219322, "learning_rate": 2.867333333333333e-06, "loss": 0.0002, "num_tokens": 423217.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 25.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 2.2725090980529785, "kl": 0.057315390557050705, "learning_rate": 2.8670000000000002e-06, "loss": 0.0994, "num_tokens": 423554.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002391482557868585, "kl": 0.00010408461093902588, "learning_rate": 2.866666666666667e-06, "loss": 0.0, "num_tokens": 423774.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 25.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.4480767250061035, "kl": 0.02600930631160736, "learning_rate": 2.8663333333333334e-06, "loss": 0.0394, "num_tokens": 424129.0, "reward": 3.875, "reward_std": 2.688710927963257, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 2.688710927963257, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 25.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.11902312189340591, "kl": 0.013970667496323586, "learning_rate": 2.866e-06, "loss": 0.0007, "num_tokens": 424431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 26.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.12946157157421112, "kl": 0.02108490839600563, "learning_rate": 2.8656666666666665e-06, "loss": 0.0011, "num_tokens": 424703.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 26.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.329207420349121, "kl": 0.08236123993992805, "learning_rate": 2.8653333333333333e-06, "loss": -0.1037, "num_tokens": 425047.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 26.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.02120780199766159, "kl": 0.003914693836122751, "learning_rate": 2.865e-06, "loss": 0.0002, "num_tokens": 425383.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 26.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.03202745318412781, "kl": 0.0058736191131174564, "learning_rate": 2.864666666666667e-06, "loss": 0.0003, "num_tokens": 425689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 26.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.018070092424750328, "kl": 0.0010024543153122067, "learning_rate": 2.8643333333333332e-06, "loss": 0.0001, "num_tokens": 426010.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 26.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.11457064002752304, "kl": 0.013781537534669042, "learning_rate": 2.864e-06, "loss": 0.0007, "num_tokens": 426340.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 26.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.006577032618224621, "kl": 0.01581774465739727, "learning_rate": 2.863666666666667e-06, "loss": 0.0008, "num_tokens": 426600.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 26.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.11055237799882889, "kl": 0.01578545314259827, "learning_rate": 2.863333333333333e-06, "loss": 0.0008, "num_tokens": 426872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09204570204019547, "kl": 0.005240729544311762, "learning_rate": 2.8630000000000004e-06, "loss": 0.0002, "num_tokens": 427126.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002377243945375085, "kl": 0.00010420382022857666, "learning_rate": 2.8626666666666667e-06, "loss": 0.0, "num_tokens": 427346.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.75, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 26.185185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 2.6935038566589355, "kl": 0.05672331899404526, "learning_rate": 2.8623333333333335e-06, "loss": -0.2265, "num_tokens": 427785.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.006571251433342695, "kl": 0.0027382224798202515, "learning_rate": 2.862e-06, "loss": 0.0001, "num_tokens": 428021.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.24043260514736176, "kl": 0.018282222794368863, "learning_rate": 2.8616666666666667e-06, "loss": 0.0009, "num_tokens": 428320.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.008064515888690948, "clip_ratio/high_mean": 0.008064515888690948, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008064515888690948, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 26.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.4426515102386475, "kl": 0.02955270535312593, "learning_rate": 2.8613333333333334e-06, "loss": -0.0004, "num_tokens": 428749.0, "reward": 2.5999999046325684, "reward_std": 0.4618801772594452, "rewards/reward_combined/mean": 2.5999999046325684, "rewards/reward_combined/std": 0.4618801772594452, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 26.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.11867247521877289, "kl": 0.010655163321644068, "learning_rate": 2.8610000000000002e-06, "loss": 0.0007, "num_tokens": 429097.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 26.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.604410707950592, "kl": 0.04736524447798729, "learning_rate": 2.860666666666667e-06, "loss": 0.0024, "num_tokens": 429357.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.16216279566287994, "kl": 0.0031636282801628113, "learning_rate": 2.8603333333333334e-06, "loss": 0.0002, "num_tokens": 429571.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 26.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.09082437306642532, "kl": 0.01961122266948223, "learning_rate": 2.86e-06, "loss": 0.001, "num_tokens": 429867.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 26.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.10625924915075302, "kl": 0.16769862174987793, "learning_rate": 2.8596666666666665e-06, "loss": 0.0084, "num_tokens": 430179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 26.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04638618603348732, "kl": 0.0008038555170060135, "learning_rate": 2.8593333333333333e-06, "loss": 0.0, "num_tokens": 430401.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 26.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11778359115123749, "kl": 0.019276143983006477, "learning_rate": 2.859e-06, "loss": 0.001, "num_tokens": 430675.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.3700369894504547, "kl": 0.01605492690578103, "learning_rate": 2.858666666666667e-06, "loss": 0.0008, "num_tokens": 430889.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 26.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.021307064220309258, "kl": 0.0946941003203392, "learning_rate": 2.8583333333333332e-06, "loss": 0.0047, "num_tokens": 431253.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 26.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.07501452416181564, "kl": 0.009454557904973626, "learning_rate": 2.858e-06, "loss": 0.0005, "num_tokens": 431545.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 26.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 3.2175190448760986, "kl": 0.033329762518405914, "learning_rate": 2.8576666666666668e-06, "loss": 0.0431, "num_tokens": 431836.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 26.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03348877653479576, "kl": 0.0033172527328133583, "learning_rate": 2.8573333333333336e-06, "loss": 0.0002, "num_tokens": 432148.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 26.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07658609747886658, "kl": 0.05102025344967842, "learning_rate": 2.8570000000000003e-06, "loss": 0.0026, "num_tokens": 432446.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 26.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03014654852449894, "kl": 0.004795238608494401, "learning_rate": 2.8566666666666667e-06, "loss": 0.0002, "num_tokens": 432735.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 26.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.11690176278352737, "kl": 0.0044711134396493435, "learning_rate": 2.8563333333333335e-06, "loss": 0.0002, "num_tokens": 432955.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 26.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03666602075099945, "kl": 0.0011766999959945679, "learning_rate": 2.856e-06, "loss": 0.0001, "num_tokens": 433199.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 26.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0306557547301054, "kl": 0.0009726583957672119, "learning_rate": 2.8556666666666666e-06, "loss": 0.0, "num_tokens": 433455.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 26.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.1592872142791748, "kl": 0.06260843947529793, "learning_rate": 2.8553333333333334e-06, "loss": 0.0031, "num_tokens": 433757.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 26.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.9480538964271545, "kl": 0.061230313032865524, "learning_rate": 2.855e-06, "loss": 0.0031, "num_tokens": 434019.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 26.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 6.19640588760376, "kl": 0.023320306092500687, "learning_rate": 2.854666666666667e-06, "loss": 0.1597, "num_tokens": 434301.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 26.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.577958583831787, "kl": 0.007525532506406307, "learning_rate": 2.8543333333333333e-06, "loss": 0.0031, "num_tokens": 434599.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 26.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.988666534423828, "kl": 0.03583371452987194, "learning_rate": 2.854e-06, "loss": 0.0964, "num_tokens": 434919.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 26.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 8.915569305419922, "kl": 0.030587024171836674, "learning_rate": 2.8536666666666665e-06, "loss": 0.0895, "num_tokens": 435158.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 84.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 84.75, "completions/mean_terminated_length": 27.666667938232422, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 26.685185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 1.9425950050354004, "kl": 0.006271434482187033, "learning_rate": 2.8533333333333333e-06, "loss": 0.3834, "num_tokens": 435725.0, "reward": 5.300000190734863, "reward_std": 5.399999618530273, "rewards/reward_combined/mean": 5.300000190734863, "rewards/reward_combined/std": 5.40000057220459, "step": 1441 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 26.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 5.355051517486572, "kl": 0.011743251234292984, "learning_rate": 2.853e-06, "loss": 0.1324, "num_tokens": 436030.0, "reward": 6.25, "reward_std": 2.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 2.5, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 26.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.06819178909063339, "kl": 0.0028445011121220887, "learning_rate": 2.852666666666667e-06, "loss": 0.0001, "num_tokens": 436300.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 26.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.034059494733810425, "kl": 0.002107149106450379, "learning_rate": 2.8523333333333336e-06, "loss": 0.0001, "num_tokens": 436562.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 26.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.053593143820762634, "kl": 0.006958060432225466, "learning_rate": 2.852e-06, "loss": 0.0003, "num_tokens": 436891.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 26.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 4.85930061340332, "kl": 0.055007945746183395, "learning_rate": 2.8516666666666668e-06, "loss": 0.0497, "num_tokens": 437187.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 26.796296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 1.4209827184677124, "kl": 0.006550558842718601, "learning_rate": 2.8513333333333335e-06, "loss": -0.0444, "num_tokens": 437517.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 26.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.053395211696624756, "kl": 0.0027509289793670177, "learning_rate": 2.8510000000000003e-06, "loss": 0.0001, "num_tokens": 437781.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.009259259328246117, "clip_ratio/high_mean": 0.009259259328246117, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009259259328246117, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 26.833333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 5.038480758666992, "kl": 0.08235517889261246, "learning_rate": 2.8506666666666667e-06, "loss": -0.0701, "num_tokens": 438111.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 26.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.007691516540944576, "kl": 0.0002653861738508567, "learning_rate": 2.8503333333333335e-06, "loss": 0.0, "num_tokens": 438419.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 26.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.201418399810791, "kl": 0.0898846909403801, "learning_rate": 2.85e-06, "loss": 0.0233, "num_tokens": 438754.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 26.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.031339455395936966, "kl": 0.0016578052891418338, "learning_rate": 2.8496666666666666e-06, "loss": 0.0001, "num_tokens": 439030.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 26.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.18257854878902435, "kl": 0.05395127087831497, "learning_rate": 2.8493333333333334e-06, "loss": 0.0026, "num_tokens": 439382.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 26.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.14778709411621094, "kl": 0.022392653860151768, "learning_rate": 2.849e-06, "loss": 0.0011, "num_tokens": 439656.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 26.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020430092699825764, "kl": 0.0010884659131988883, "learning_rate": 2.848666666666667e-06, "loss": 0.0001, "num_tokens": 439936.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 26.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 10.953999519348145, "kl": 0.020798705518245697, "learning_rate": 2.8483333333333333e-06, "loss": 0.0803, "num_tokens": 440213.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 26.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.174964427947998, "kl": 0.008955185767263174, "learning_rate": 2.848e-06, "loss": 0.0011, "num_tokens": 440517.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 27.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.023876987397670746, "kl": 0.004471091320738196, "learning_rate": 2.8476666666666665e-06, "loss": 0.0002, "num_tokens": 440801.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 27.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04079962894320488, "kl": 0.001997561543248594, "learning_rate": 2.8473333333333337e-06, "loss": 0.0001, "num_tokens": 441108.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 1.3792517185211182, "kl": 0.12301432993263006, "learning_rate": 2.847e-06, "loss": 0.0062, "num_tokens": 441404.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 27.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.037380483001470566, "kl": 0.0010350601805839688, "learning_rate": 2.846666666666667e-06, "loss": 0.0001, "num_tokens": 441637.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 27.074074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 5.035538196563721, "kl": 0.13028479367494583, "learning_rate": 2.8463333333333336e-06, "loss": 0.058, "num_tokens": 441971.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 27.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.08378343284130096, "kl": 0.045418718829751015, "learning_rate": 2.846e-06, "loss": 0.0023, "num_tokens": 442273.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 27.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02732064388692379, "kl": 0.0015994884306564927, "learning_rate": 2.8456666666666667e-06, "loss": 0.0001, "num_tokens": 442535.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 27.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1321711540222168, "kl": 0.039184389635920525, "learning_rate": 2.8453333333333335e-06, "loss": 0.002, "num_tokens": 442831.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 27.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.005658379755914211, "kl": 0.0014997664839029312, "learning_rate": 2.8450000000000003e-06, "loss": 0.0001, "num_tokens": 443143.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 27.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.07467425614595413, "kl": 0.01969664730131626, "learning_rate": 2.8446666666666666e-06, "loss": 0.001, "num_tokens": 443465.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 27.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.08769779652357101, "kl": 0.01807217695750296, "learning_rate": 2.8443333333333334e-06, "loss": 0.0009, "num_tokens": 443751.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 27.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.03231378272175789, "kl": 0.004902600310742855, "learning_rate": 2.844e-06, "loss": 0.0002, "num_tokens": 444056.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 27.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.08490697294473648, "kl": 0.017958096228539944, "learning_rate": 2.8436666666666666e-06, "loss": 0.0009, "num_tokens": 444383.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 27.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.018972262740135193, "kl": 0.09514028578996658, "learning_rate": 2.8433333333333334e-06, "loss": 0.0048, "num_tokens": 444747.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 27.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.058594103902578354, "kl": 0.0005277901946101338, "learning_rate": 2.843e-06, "loss": 0.0, "num_tokens": 444961.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 27.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.33628398180007935, "kl": 0.04409374576061964, "learning_rate": 2.842666666666667e-06, "loss": 0.0022, "num_tokens": 445287.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 27.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.14954088628292084, "kl": 0.006559070199728012, "learning_rate": 2.8423333333333333e-06, "loss": 0.0003, "num_tokens": 445547.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.314814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 7.589659214019775, "kl": 0.05561095476150513, "learning_rate": 2.842e-06, "loss": -0.037, "num_tokens": 445834.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 27.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.012504206039011478, "kl": 0.008804184384644032, "learning_rate": 2.8416666666666664e-06, "loss": 0.0004, "num_tokens": 446106.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09010161459445953, "kl": 0.003900033188983798, "learning_rate": 2.8413333333333336e-06, "loss": 0.0002, "num_tokens": 446370.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 27.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00025419285520911217, "kl": 0.00010056048631668091, "learning_rate": 2.841e-06, "loss": 0.0, "num_tokens": 446590.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 27.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 9.483871459960938, "kl": 0.028468238189816475, "learning_rate": 2.8406666666666668e-06, "loss": -0.0025, "num_tokens": 446862.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 27.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 3.8909201622009277, "kl": 0.06551635637879372, "learning_rate": 2.8403333333333336e-06, "loss": 0.0045, "num_tokens": 447197.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.019390955567359924, "kl": 0.008523456286638975, "learning_rate": 2.84e-06, "loss": 0.0004, "num_tokens": 447465.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 27.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.002513324609026313, "kl": 0.0009862482838798314, "learning_rate": 2.8396666666666667e-06, "loss": 0.0, "num_tokens": 447745.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 27.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.6288270950317383, "kl": 0.05403287336230278, "learning_rate": 2.8393333333333335e-06, "loss": 0.1173, "num_tokens": 448094.0, "reward": 4.5, "reward_std": 2.0, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.0, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 27.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0399010106921196, "kl": 0.003508269786834717, "learning_rate": 2.8390000000000003e-06, "loss": 0.0002, "num_tokens": 448306.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 27.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.1594657003879547, "kl": 0.01845983834937215, "learning_rate": 2.8386666666666666e-06, "loss": 0.0009, "num_tokens": 448606.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 27.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.020616723224520683, "kl": 0.005141196423210204, "learning_rate": 2.8383333333333334e-06, "loss": 0.0003, "num_tokens": 448897.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 27.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.6328845620155334, "kl": 0.07485029846429825, "learning_rate": 2.8379999999999998e-06, "loss": 0.0037, "num_tokens": 449157.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 27.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.12431022524833679, "kl": 0.041104961186647415, "learning_rate": 2.8376666666666665e-06, "loss": 0.002, "num_tokens": 449574.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 27.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.006374900694936514, "kl": 0.0027919337153434753, "learning_rate": 2.8373333333333338e-06, "loss": 0.0001, "num_tokens": 449810.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 27.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.0332266241312027, "kl": 0.02783461380749941, "learning_rate": 2.837e-06, "loss": 0.0014, "num_tokens": 450164.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 27.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.20492404699325562, "kl": 0.05932088941335678, "learning_rate": 2.836666666666667e-06, "loss": 0.0029, "num_tokens": 450464.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 27.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 12.562373161315918, "kl": 0.028178778651636094, "learning_rate": 2.8363333333333333e-06, "loss": -0.1328, "num_tokens": 450682.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 27.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03149878606200218, "kl": 0.005483974236994982, "learning_rate": 2.836e-06, "loss": 0.0003, "num_tokens": 450971.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 27.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 26.841873168945312, "kl": 4.637267604470253, "learning_rate": 2.835666666666667e-06, "loss": 0.3793, "num_tokens": 451189.0, "reward": 2.5, "reward_std": 3.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 27.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.08196630328893661, "kl": 0.003600056399591267, "learning_rate": 2.8353333333333336e-06, "loss": 0.0002, "num_tokens": 451437.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 27.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.5261520147323608, "kl": 0.1114635318517685, "learning_rate": 2.835e-06, "loss": 0.0056, "num_tokens": 451731.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.04770840331912041, "kl": 0.005972094601020217, "learning_rate": 2.8346666666666667e-06, "loss": 0.0003, "num_tokens": 452013.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 27.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.1798420548439026, "kl": 0.01135201659053564, "learning_rate": 2.8343333333333335e-06, "loss": 0.0005, "num_tokens": 452256.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 27.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.939312934875488, "kl": 0.019981331191956997, "learning_rate": 2.834e-06, "loss": 0.0506, "num_tokens": 452602.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 27.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.25883594155311584, "kl": 0.01398910884745419, "learning_rate": 2.8336666666666667e-06, "loss": 0.0007, "num_tokens": 452858.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 27.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.043560244143009186, "kl": 0.004490455146878958, "learning_rate": 2.8333333333333335e-06, "loss": 0.0002, "num_tokens": 453192.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 27.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.01911994256079197, "kl": 0.000843496760353446, "learning_rate": 2.8330000000000002e-06, "loss": 0.0, "num_tokens": 453509.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.014285714365541935, "clip_ratio/high_mean": 0.014285714365541935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014285714365541935, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 27.833333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 6.820046901702881, "kl": 0.11318179219961166, "learning_rate": 2.8326666666666666e-06, "loss": -0.162, "num_tokens": 453816.0, "reward": 2.375, "reward_std": 1.8874585628509521, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.8874585628509521, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07642263919115067, "kl": 0.010808147490024567, "learning_rate": 2.8323333333333334e-06, "loss": 0.0006, "num_tokens": 454088.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 27.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.006149383261799812, "kl": 0.015830593183636665, "learning_rate": 2.8319999999999997e-06, "loss": 0.0008, "num_tokens": 454348.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 27.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.720034599304199, "kl": 0.07616878300905228, "learning_rate": 2.831666666666667e-06, "loss": 0.0938, "num_tokens": 454684.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 27.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.05332932248711586, "kl": 0.1629938930273056, "learning_rate": 2.8313333333333337e-06, "loss": 0.0082, "num_tokens": 454993.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 27.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04099570959806442, "kl": 0.0013543331297114491, "learning_rate": 2.831e-06, "loss": 0.0001, "num_tokens": 455269.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 27.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.038438763469457626, "kl": 0.0010230416955891997, "learning_rate": 2.830666666666667e-06, "loss": 0.0, "num_tokens": 455539.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 27.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0880391076207161, "kl": 0.002278156578540802, "learning_rate": 2.8303333333333332e-06, "loss": 0.0001, "num_tokens": 455747.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 8.167452812194824, "kl": 0.006329163908958435, "learning_rate": 2.83e-06, "loss": -0.0231, "num_tokens": 456022.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 28.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04484656825661659, "kl": 0.0024207322858273983, "learning_rate": 2.829666666666667e-06, "loss": 0.0001, "num_tokens": 456333.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 28.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.012770017609000206, "kl": 0.0002797568013193086, "learning_rate": 2.8293333333333336e-06, "loss": 0.0, "num_tokens": 456640.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 28.037037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.8218302726745605, "kl": 0.0563189834356308, "learning_rate": 2.829e-06, "loss": 0.0101, "num_tokens": 456976.0, "reward": 2.0, "reward_std": 2.345207929611206, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 2.345207929611206, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 28.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.07622843235731125, "kl": 0.009533829987049103, "learning_rate": 2.8286666666666667e-06, "loss": 0.0005, "num_tokens": 457269.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 28.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.00024697667686268687, "kl": 0.00010000169277191162, "learning_rate": 2.8283333333333335e-06, "loss": 0.0, "num_tokens": 457489.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 28.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.059532761573791504, "kl": 0.0007642433047294617, "learning_rate": 2.828e-06, "loss": 0.0, "num_tokens": 457697.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 28.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.14826706051826477, "kl": 0.016935094725340605, "learning_rate": 2.8276666666666666e-06, "loss": 0.0009, "num_tokens": 458030.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 28.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.900506973266602, "kl": 0.09935533255338669, "learning_rate": 2.8273333333333334e-06, "loss": 0.0443, "num_tokens": 458398.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 28.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.054693274199962616, "kl": 0.049269139766693115, "learning_rate": 2.827e-06, "loss": 0.0025, "num_tokens": 458726.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 28.166666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 10.160628318786621, "kl": 0.017831362085416913, "learning_rate": 2.8266666666666666e-06, "loss": 0.15, "num_tokens": 458995.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 28.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.011075237765908241, "kl": 0.00912595959380269, "learning_rate": 2.8263333333333333e-06, "loss": 0.0005, "num_tokens": 459267.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 28.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.016902657225728035, "kl": 0.0006451904773712158, "learning_rate": 2.8259999999999997e-06, "loss": 0.0, "num_tokens": 459527.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1523 }, { "clip_ratio/high_max": 0.01515151560306549, "clip_ratio/high_mean": 0.01515151560306549, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01515151560306549, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.7644505500793457, "kl": 0.0067447873298078775, "learning_rate": 2.825666666666667e-06, "loss": 0.0134, "num_tokens": 459843.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 28.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.022648751735687256, "kl": 0.0008592535159550607, "learning_rate": 2.8253333333333337e-06, "loss": 0.0, "num_tokens": 460160.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 28.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.156835079193115, "kl": 0.0632067397236824, "learning_rate": 2.825e-06, "loss": 0.0063, "num_tokens": 460475.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 28.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.04755140841007233, "kl": 0.003388937722775154, "learning_rate": 2.824666666666667e-06, "loss": 0.0002, "num_tokens": 460745.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 28.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.026772283017635345, "kl": 0.008730332367122173, "learning_rate": 2.824333333333333e-06, "loss": 0.0004, "num_tokens": 461046.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 28.314814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 3.9703738689422607, "kl": 0.08144580945372581, "learning_rate": 2.824e-06, "loss": 0.0628, "num_tokens": 461403.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 28.333333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 7.690491199493408, "kl": 0.04787398502230644, "learning_rate": 2.8236666666666668e-06, "loss": 0.0311, "num_tokens": 461725.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 28.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009542643092572689, "kl": 0.0005669295787811279, "learning_rate": 2.8233333333333335e-06, "loss": 0.0, "num_tokens": 461941.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.3834571838378906, "kl": 0.009222902008332312, "learning_rate": 2.823e-06, "loss": -0.0002, "num_tokens": 462237.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 28.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.005147993564605713, "kl": 0.01599263586103916, "learning_rate": 2.8226666666666667e-06, "loss": 0.0008, "num_tokens": 462497.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 28.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 7.021935939788818, "kl": 0.004281937421183102, "learning_rate": 2.8223333333333335e-06, "loss": 0.0933, "num_tokens": 462719.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 28.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.060498569160699844, "kl": 0.02687366772443056, "learning_rate": 2.822e-06, "loss": 0.0014, "num_tokens": 463078.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 28.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.05655942112207413, "kl": 0.0014583574957214296, "learning_rate": 2.821666666666667e-06, "loss": 0.0001, "num_tokens": 463350.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 28.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.3880438804626465, "kl": 0.10363345220685005, "learning_rate": 2.8213333333333334e-06, "loss": 0.033, "num_tokens": 463655.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 28.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07333117723464966, "kl": 0.01207177247852087, "learning_rate": 2.821e-06, "loss": 0.0006, "num_tokens": 463955.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 28.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.049097783863544464, "kl": 0.16059517115354538, "learning_rate": 2.8206666666666665e-06, "loss": 0.008, "num_tokens": 464265.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 28.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.023787448182702065, "kl": 0.0008872672915458679, "learning_rate": 2.8203333333333333e-06, "loss": 0.0, "num_tokens": 464477.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 28.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.007525818422436714, "kl": 0.0009126712975557894, "learning_rate": 2.82e-06, "loss": 0.0, "num_tokens": 464693.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.08046360313892365, "kl": 0.01114275585860014, "learning_rate": 2.819666666666667e-06, "loss": 0.0006, "num_tokens": 464967.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 28.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0992235317826271, "kl": 0.02111656591296196, "learning_rate": 2.8193333333333337e-06, "loss": 0.0011, "num_tokens": 465261.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 28.59259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 2.259025812149048, "kl": 0.05788859911262989, "learning_rate": 2.819e-06, "loss": 0.3244, "num_tokens": 465622.0, "reward": 3.5, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 4.690415859222412, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06180359795689583, "kl": 0.004265672294422984, "learning_rate": 2.818666666666667e-06, "loss": 0.0002, "num_tokens": 465901.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 28.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 1.0129468441009521, "kl": 0.064463309943676, "learning_rate": 2.818333333333333e-06, "loss": 0.0032, "num_tokens": 466137.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03907158598303795, "kl": 0.007112330291420221, "learning_rate": 2.818e-06, "loss": 0.0004, "num_tokens": 466407.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 28.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027319842483848333, "kl": 0.0003667546552605927, "learning_rate": 2.8176666666666667e-06, "loss": 0.0, "num_tokens": 466721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 28.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.0143232187256217, "kl": 0.0007744103495497257, "learning_rate": 2.8173333333333335e-06, "loss": 0.0, "num_tokens": 466977.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 28.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.052229199558496475, "kl": 0.011092833708971739, "learning_rate": 2.817e-06, "loss": 0.0006, "num_tokens": 467304.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.02033820189535618, "kl": 0.006365820998325944, "learning_rate": 2.8166666666666667e-06, "loss": 0.0003, "num_tokens": 467572.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 28.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.019550353288650513, "kl": 0.001157495629740879, "learning_rate": 2.8163333333333334e-06, "loss": 0.0001, "num_tokens": 467834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 28.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.208601951599121, "kl": 0.05314292386174202, "learning_rate": 2.8160000000000002e-06, "loss": 0.0846, "num_tokens": 468123.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 28.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.7484064698219299, "kl": 0.13676604256033897, "learning_rate": 2.815666666666667e-06, "loss": 0.0068, "num_tokens": 468419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 28.796296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 6.701013565063477, "kl": 0.019662877544760704, "learning_rate": 2.8153333333333334e-06, "loss": 0.2003, "num_tokens": 468692.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 28.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0669856145977974, "kl": 0.010359282605350018, "learning_rate": 2.815e-06, "loss": 0.0005, "num_tokens": 469014.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 28.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.02854492887854576, "kl": 0.0026429988211020827, "learning_rate": 2.8146666666666665e-06, "loss": 0.0001, "num_tokens": 469302.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 28.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.12357215583324432, "kl": 0.0030721084913238883, "learning_rate": 2.8143333333333333e-06, "loss": 0.0001, "num_tokens": 469536.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 28.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.7089741826057434, "kl": 0.06357227265834808, "learning_rate": 2.814e-06, "loss": 0.0032, "num_tokens": 469814.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 28.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02073819376528263, "kl": 0.09475615248084068, "learning_rate": 2.813666666666667e-06, "loss": 0.0047, "num_tokens": 470178.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 28.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.18267345428466797, "kl": 0.011012335307896137, "learning_rate": 2.8133333333333336e-06, "loss": 0.0005, "num_tokens": 470421.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 28.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.13044600188732147, "kl": 0.010010089725255966, "learning_rate": 2.813e-06, "loss": 0.0005, "num_tokens": 470688.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.15959681570529938, "kl": 0.010754904244095087, "learning_rate": 2.8126666666666668e-06, "loss": 0.0005, "num_tokens": 470968.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 28.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.010415318422019482, "kl": 0.0018611857667565346, "learning_rate": 2.812333333333333e-06, "loss": 0.0001, "num_tokens": 471280.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 28.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.20058171451091766, "kl": 0.0511362012475729, "learning_rate": 2.812e-06, "loss": 0.0026, "num_tokens": 471585.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 29.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04505414888262749, "kl": 0.05116620287299156, "learning_rate": 2.8116666666666667e-06, "loss": 0.0026, "num_tokens": 471990.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 29.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.008563397452235222, "kl": 0.0004373746196506545, "learning_rate": 2.8113333333333335e-06, "loss": 0.0, "num_tokens": 472225.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 29.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.038594141602516174, "kl": 0.004309385549277067, "learning_rate": 2.8110000000000003e-06, "loss": 0.0002, "num_tokens": 472549.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 29.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.12808479368686676, "kl": 0.009709966834634542, "learning_rate": 2.8106666666666666e-06, "loss": 0.0005, "num_tokens": 472811.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 29.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.09926086664199829, "kl": 0.03737693093717098, "learning_rate": 2.8103333333333334e-06, "loss": 0.0019, "num_tokens": 473161.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 29.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 1.2803255319595337, "kl": 0.167035561054945, "learning_rate": 2.81e-06, "loss": 0.0075, "num_tokens": 473483.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 29.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.206502676010132, "kl": 0.0034317674580961466, "learning_rate": 2.809666666666667e-06, "loss": 0.0138, "num_tokens": 473789.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 29.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08774306625127792, "kl": 0.06965498067438602, "learning_rate": 2.8093333333333333e-06, "loss": 0.0034, "num_tokens": 474200.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 29.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09498246014118195, "kl": 0.010398188140243292, "learning_rate": 2.809e-06, "loss": 0.0005, "num_tokens": 474502.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 29.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.09661281853914261, "kl": 0.013702782180189388, "learning_rate": 2.8086666666666665e-06, "loss": 0.0007, "num_tokens": 474775.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 29.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.022496992722153664, "kl": 0.00807579094544053, "learning_rate": 2.8083333333333333e-06, "loss": 0.0004, "num_tokens": 475079.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 29.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.014664665795862675, "kl": 0.0002637431025505066, "learning_rate": 2.808e-06, "loss": 0.0, "num_tokens": 475291.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 29.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 5.122772693634033, "kl": 0.2681492482079193, "learning_rate": 2.807666666666667e-06, "loss": 0.0488, "num_tokens": 475573.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 29.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.022311396896839142, "kl": 0.0036019354593008757, "learning_rate": 2.8073333333333336e-06, "loss": 0.0002, "num_tokens": 475833.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 29.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.6579437255859375, "kl": 0.0701053871307522, "learning_rate": 2.807e-06, "loss": 0.0038, "num_tokens": 476120.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 29.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.22422263026237488, "kl": 0.075319929048419, "learning_rate": 2.8066666666666668e-06, "loss": 0.0038, "num_tokens": 476433.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 29.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.004530862905085087, "kl": 0.00031580403447151184, "learning_rate": 2.806333333333333e-06, "loss": 0.0, "num_tokens": 476677.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 29.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.03519684821367264, "kl": 0.012216292787343264, "learning_rate": 2.8060000000000003e-06, "loss": 0.0006, "num_tokens": 476961.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 29.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.007863408885896206, "kl": 0.0005006988649256527, "learning_rate": 2.8056666666666667e-06, "loss": 0.0, "num_tokens": 477280.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 29.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04942427575588226, "kl": 0.15612779557704926, "learning_rate": 2.8053333333333335e-06, "loss": 0.0078, "num_tokens": 477591.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 29.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.13410508632659912, "kl": 0.027443756349384785, "learning_rate": 2.8050000000000002e-06, "loss": 0.0014, "num_tokens": 477889.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 29.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.087430953979492, "kl": 0.05423801206052303, "learning_rate": 2.8046666666666666e-06, "loss": -0.0041, "num_tokens": 478240.0, "reward": 3.375, "reward_std": 0.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 0.25, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 29.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 5.397153854370117, "kl": 0.04278162680566311, "learning_rate": 2.8043333333333334e-06, "loss": 0.0306, "num_tokens": 478546.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 29.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0318794883787632, "kl": 0.004159346804954112, "learning_rate": 2.804e-06, "loss": 0.0002, "num_tokens": 478837.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 29.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 6.154887676239014, "kl": 0.3436839394271374, "learning_rate": 2.803666666666667e-06, "loss": 0.0825, "num_tokens": 479190.0, "reward": 6.625, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.0966243743896484, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 29.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.07447133213281631, "kl": 0.0019177318026777357, "learning_rate": 2.8033333333333333e-06, "loss": 0.0001, "num_tokens": 479446.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 29.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01250352244824171, "kl": 0.008539030328392982, "learning_rate": 2.803e-06, "loss": 0.0004, "num_tokens": 479718.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 29.5, "frac_reward_zero_std": 0.0, "grad_norm": 9.671222686767578, "kl": 0.03675028495490551, "learning_rate": 2.8026666666666665e-06, "loss": 0.1471, "num_tokens": 479986.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 29.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10601507127285004, "kl": 0.002778945490717888, "learning_rate": 2.8023333333333332e-06, "loss": 0.0001, "num_tokens": 480213.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 29.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.2422441691160202, "kl": 0.06250947341322899, "learning_rate": 2.802e-06, "loss": 0.0034, "num_tokens": 480514.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 29.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.00024913743254728615, "kl": 9.768456220626831e-05, "learning_rate": 2.801666666666667e-06, "loss": 0.0, "num_tokens": 480734.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 29.574074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 5.347248077392578, "kl": 0.13809160143136978, "learning_rate": 2.8013333333333336e-06, "loss": 0.054, "num_tokens": 481062.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 29.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.08372911810874939, "kl": 0.007937636459246278, "learning_rate": 2.801e-06, "loss": 0.0004, "num_tokens": 481392.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 29.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.006388956680893898, "kl": 0.015753159299492836, "learning_rate": 2.8006666666666667e-06, "loss": 0.0008, "num_tokens": 481652.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 29.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004103833809494972, "kl": 0.00015307665307773277, "learning_rate": 2.800333333333333e-06, "loss": 0.0, "num_tokens": 481872.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 29.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.012417380698025227, "kl": 0.0006692036986351013, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "num_tokens": 482132.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 29.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.10515747219324112, "kl": 0.016220704652369022, "learning_rate": 2.7996666666666667e-06, "loss": 0.0009, "num_tokens": 482456.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 29.685185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 6.032846450805664, "kl": 0.051607828587293625, "learning_rate": 2.7993333333333334e-06, "loss": -0.0995, "num_tokens": 482734.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 29.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.009661445394158363, "kl": 0.0017271991819143295, "learning_rate": 2.7990000000000002e-06, "loss": 0.0001, "num_tokens": 483046.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 29.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.2698768079280853, "kl": 0.08649946004152298, "learning_rate": 2.7986666666666666e-06, "loss": 0.0041, "num_tokens": 483403.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 29.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06218300014734268, "kl": 0.006660278420895338, "learning_rate": 2.7983333333333334e-06, "loss": 0.0003, "num_tokens": 483675.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 29.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028913540299981833, "kl": 0.0006885349866934121, "learning_rate": 2.798e-06, "loss": 0.0, "num_tokens": 483935.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 29.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.5006610155105591, "kl": 0.06085568247362971, "learning_rate": 2.797666666666667e-06, "loss": 0.003, "num_tokens": 484226.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 29.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.006548902485519648, "kl": 0.0039421889232471585, "learning_rate": 2.7973333333333333e-06, "loss": 0.0002, "num_tokens": 484494.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 29.814814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 6.824710845947266, "kl": 0.010844754579011351, "learning_rate": 2.797e-06, "loss": 0.253, "num_tokens": 484802.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 29.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.041225120425224304, "kl": 0.0008110776543617249, "learning_rate": 2.7966666666666664e-06, "loss": 0.0001, "num_tokens": 485010.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 29.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04550207778811455, "kl": 0.006921528605744243, "learning_rate": 2.7963333333333332e-06, "loss": 0.0004, "num_tokens": 485346.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 29.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09664245694875717, "kl": 0.02657230943441391, "learning_rate": 2.7960000000000004e-06, "loss": 0.0013, "num_tokens": 485635.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 29.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 6.3565778732299805, "kl": 0.6150377094745636, "learning_rate": 2.7956666666666668e-06, "loss": 0.0308, "num_tokens": 485935.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 29.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 6.4122538566589355, "kl": 0.16321531683206558, "learning_rate": 2.7953333333333336e-06, "loss": -0.0418, "num_tokens": 486209.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 29.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.008500074036419392, "kl": 0.002206355333328247, "learning_rate": 2.795e-06, "loss": 0.0001, "num_tokens": 486445.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 29.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.024119025096297264, "kl": 0.09382932633161545, "learning_rate": 2.7946666666666667e-06, "loss": 0.0047, "num_tokens": 486809.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 29.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712360367178917, "kl": 0.004835872328840196, "learning_rate": 2.7943333333333335e-06, "loss": 0.0002, "num_tokens": 487127.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 29.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08054126054048538, "kl": 0.002262810943648219, "learning_rate": 2.7940000000000003e-06, "loss": 0.0002, "num_tokens": 487343.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.16639643907546997, "kl": 0.00734925945289433, "learning_rate": 2.7936666666666666e-06, "loss": 0.0004, "num_tokens": 487639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 30.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05367019400000572, "kl": 0.04935498721897602, "learning_rate": 2.7933333333333334e-06, "loss": 0.0025, "num_tokens": 487967.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 30.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0482790581882, "kl": 0.005178533028811216, "learning_rate": 2.793e-06, "loss": 0.0002, "num_tokens": 488241.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 30.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.2608712613582611, "kl": 0.014890595804899931, "learning_rate": 2.7926666666666666e-06, "loss": 0.0007, "num_tokens": 488513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 30.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06302923709154129, "kl": 0.03252813499420881, "learning_rate": 2.7923333333333333e-06, "loss": 0.0017, "num_tokens": 488805.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 30.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.037170834839344025, "kl": 0.0008052513003349304, "learning_rate": 2.792e-06, "loss": 0.0, "num_tokens": 489015.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 30.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02708226442337036, "kl": 0.09318660199642181, "learning_rate": 2.791666666666667e-06, "loss": 0.0047, "num_tokens": 489379.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 30.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1872541755437851, "kl": 0.07842371612787247, "learning_rate": 2.7913333333333333e-06, "loss": 0.0034, "num_tokens": 489709.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04665502905845642, "kl": 0.0037679600063711405, "learning_rate": 2.791e-06, "loss": 0.0002, "num_tokens": 489991.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 30.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002486102166585624, "kl": 9.655207395553589e-05, "learning_rate": 2.7906666666666664e-06, "loss": 0.0, "num_tokens": 490211.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 30.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.04373552277684212, "kl": 0.00536091229878366, "learning_rate": 2.7903333333333336e-06, "loss": 0.0003, "num_tokens": 490501.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 30.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 5.721534729003906, "kl": 0.02560626238118857, "learning_rate": 2.7900000000000004e-06, "loss": 0.1521, "num_tokens": 490793.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 30.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.06082050874829292, "kl": 0.00584647711366415, "learning_rate": 2.7896666666666668e-06, "loss": 0.0003, "num_tokens": 491089.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 30.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03735148906707764, "kl": 0.0037579393247142434, "learning_rate": 2.7893333333333335e-06, "loss": 0.0002, "num_tokens": 491359.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 30.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04815084859728813, "kl": 0.0004974707844667137, "learning_rate": 2.789e-06, "loss": 0.0, "num_tokens": 491572.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 30.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.07238934934139252, "kl": 0.002373345196247101, "learning_rate": 2.7886666666666667e-06, "loss": 0.0001, "num_tokens": 491832.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 30.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 9.438929557800293, "kl": 0.008408480149228126, "learning_rate": 2.7883333333333335e-06, "loss": 0.2166, "num_tokens": 492059.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.005664670839905739, "kl": 0.004119423218071461, "learning_rate": 2.7880000000000002e-06, "loss": 0.0002, "num_tokens": 492327.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.011643906123936176, "kl": 0.008878742344677448, "learning_rate": 2.7876666666666666e-06, "loss": 0.0004, "num_tokens": 492599.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 30.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.22290551662445068, "kl": 0.05015936307609081, "learning_rate": 2.7873333333333334e-06, "loss": 0.0025, "num_tokens": 492903.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 30.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00424150051549077, "kl": 0.0003255121409893036, "learning_rate": 2.787e-06, "loss": 0.0, "num_tokens": 493147.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 30.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.18955810368061066, "kl": 0.13256411626935005, "learning_rate": 2.7866666666666665e-06, "loss": 0.0065, "num_tokens": 493458.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 30.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.008014434017241001, "kl": 0.0022009164094924927, "learning_rate": 2.7863333333333333e-06, "loss": 0.0001, "num_tokens": 493694.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05507928878068924, "kl": 0.00199961184989661, "learning_rate": 2.786e-06, "loss": 0.0001, "num_tokens": 493960.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 30.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.019806496798992157, "kl": 0.0007509946881327778, "learning_rate": 2.785666666666667e-06, "loss": 0.0, "num_tokens": 494216.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 30.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04594004154205322, "kl": 0.006491444306448102, "learning_rate": 2.7853333333333332e-06, "loss": 0.0003, "num_tokens": 494546.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.226511150598526, "kl": 0.03469456639140844, "learning_rate": 2.785e-06, "loss": 0.0019, "num_tokens": 494838.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04777868837118149, "kl": 0.013245042180642486, "learning_rate": 2.7846666666666664e-06, "loss": 0.0007, "num_tokens": 495110.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 30.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.025857778266072273, "kl": 0.0008294135332107544, "learning_rate": 2.7843333333333336e-06, "loss": 0.0, "num_tokens": 495322.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 30.537037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 10.455300331115723, "kl": 0.02141680009663105, "learning_rate": 2.7840000000000004e-06, "loss": 0.1231, "num_tokens": 495559.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1649 }, { "clip_ratio/high_max": 0.010204081423580647, "clip_ratio/high_mean": 0.010204081423580647, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010204081423580647, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 30.555555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 3.250657320022583, "kl": 0.0780343022197485, "learning_rate": 2.7836666666666667e-06, "loss": -0.0085, "num_tokens": 495872.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 30.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.1672305315732956, "kl": 0.056687891483306885, "learning_rate": 2.7833333333333335e-06, "loss": 0.0028, "num_tokens": 496209.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 30.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029121653642505407, "kl": 0.0002818405773723498, "learning_rate": 2.783e-06, "loss": 0.0, "num_tokens": 496521.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 30.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.11224111914634705, "kl": 0.014087349642068148, "learning_rate": 2.7826666666666666e-06, "loss": 0.0006, "num_tokens": 496844.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 30.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.0967931747436523, "kl": 0.1443721354007721, "learning_rate": 2.7823333333333334e-06, "loss": -0.0254, "num_tokens": 497188.0, "reward": 3.125, "reward_std": 0.25, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 0.25, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 30.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.3747810423374176, "kl": 0.042956399731338024, "learning_rate": 2.7820000000000002e-06, "loss": 0.002, "num_tokens": 497477.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 30.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.012312564067542553, "kl": 0.0035613150103017688, "learning_rate": 2.7816666666666666e-06, "loss": 0.0002, "num_tokens": 497737.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.05929622799158096, "kl": 0.0042467673774808645, "learning_rate": 2.7813333333333334e-06, "loss": 0.0002, "num_tokens": 498039.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.02958487533032894, "kl": 0.0027180557372048497, "learning_rate": 2.781e-06, "loss": 0.0001, "num_tokens": 498335.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 30.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.03093082830309868, "kl": 0.002050258044619113, "learning_rate": 2.7806666666666665e-06, "loss": 0.0001, "num_tokens": 498597.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 6.855483055114746, "kl": 0.0427514873445034, "learning_rate": 2.7803333333333337e-06, "loss": 0.0471, "num_tokens": 498872.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 30.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.009033478796482086, "kl": 0.015258362051099539, "learning_rate": 2.78e-06, "loss": 0.0008, "num_tokens": 499132.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 30.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 4.0366926193237305, "kl": 0.04934370703995228, "learning_rate": 2.779666666666667e-06, "loss": -0.0764, "num_tokens": 499497.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.796296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 6.5685930252075195, "kl": 0.05688664689660072, "learning_rate": 2.779333333333333e-06, "loss": 0.0684, "num_tokens": 499785.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 30.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.03696705400943756, "kl": 0.003261456935433671, "learning_rate": 2.779e-06, "loss": 0.0002, "num_tokens": 500108.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 30.833333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 2.941387176513672, "kl": 0.018697240389883518, "learning_rate": 2.7786666666666668e-06, "loss": 0.0155, "num_tokens": 500414.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 30.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.030107568949460983, "kl": 0.0052183972438797355, "learning_rate": 2.7783333333333336e-06, "loss": 0.0003, "num_tokens": 500762.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 30.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.573408126831055, "kl": 0.013701325049623847, "learning_rate": 2.7780000000000003e-06, "loss": 0.0426, "num_tokens": 501097.0, "reward": 5.550000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 5.550000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 30.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07573556900024414, "kl": 0.010404855944216251, "learning_rate": 2.7776666666666667e-06, "loss": 0.0005, "num_tokens": 501397.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.011363636702299118, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 30.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 5.8701605796813965, "kl": 0.01183168776333332, "learning_rate": 2.7773333333333335e-06, "loss": 0.0591, "num_tokens": 501735.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.13676707446575165, "kl": 0.047327421605587006, "learning_rate": 2.777e-06, "loss": 0.0024, "num_tokens": 502027.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 30.944444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 2.428872585296631, "kl": 0.049864813685417175, "learning_rate": 2.7766666666666666e-06, "loss": 0.1032, "num_tokens": 502448.0, "reward": 2.125, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 1.4361406564712524, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 30.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.014067181386053562, "kl": 0.0022719979751855135, "learning_rate": 2.7763333333333334e-06, "loss": 0.0001, "num_tokens": 502762.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 30.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.011467641219496727, "kl": 0.0005548670887947083, "learning_rate": 2.776e-06, "loss": 0.0, "num_tokens": 502978.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.17998981475830078, "kl": 0.030927646905183792, "learning_rate": 2.7756666666666665e-06, "loss": 0.0017, "num_tokens": 503245.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 31.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.40665602684021, "kl": 0.07047554478049278, "learning_rate": 2.7753333333333333e-06, "loss": 0.0606, "num_tokens": 503612.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 31.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.04566563665866852, "kl": 0.07811117172241211, "learning_rate": 2.775e-06, "loss": 0.0039, "num_tokens": 503975.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 31.055555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 2.729077100753784, "kl": 0.056703547947108746, "learning_rate": 2.774666666666667e-06, "loss": 0.0821, "num_tokens": 504287.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 31.074074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.4384732246398926, "kl": 0.005358624504879117, "learning_rate": 2.7743333333333337e-06, "loss": -0.021, "num_tokens": 504630.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 31.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.01795472949743271, "kl": 0.0021169992396607995, "learning_rate": 2.774e-06, "loss": 0.0001, "num_tokens": 504920.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 31.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0991416797041893, "kl": 0.003429839853197336, "learning_rate": 2.773666666666667e-06, "loss": 0.0002, "num_tokens": 505140.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 31.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.946234703063965, "kl": 0.042977893725037575, "learning_rate": 2.773333333333333e-06, "loss": 0.0667, "num_tokens": 505470.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 31.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06238122656941414, "kl": 0.006720427889376879, "learning_rate": 2.773e-06, "loss": 0.0003, "num_tokens": 505763.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 31.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003167220565956086, "kl": 8.293986320495605e-05, "learning_rate": 2.7726666666666667e-06, "loss": 0.0, "num_tokens": 505983.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 31.185185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 4.205992698669434, "kl": 0.0023598200641572475, "learning_rate": 2.7723333333333335e-06, "loss": 0.3666, "num_tokens": 506221.0, "reward": 2.625, "reward_std": 2.75, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 2.75, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 31.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.012067733332514763, "kl": 0.0007444173097610474, "learning_rate": 2.7720000000000003e-06, "loss": 0.0, "num_tokens": 506481.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 31.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.11107742041349411, "kl": 0.03276311792433262, "learning_rate": 2.7716666666666667e-06, "loss": 0.0017, "num_tokens": 506785.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04169462248682976, "kl": 0.003586039907531813, "learning_rate": 2.7713333333333335e-06, "loss": 0.0002, "num_tokens": 507053.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 31.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.579318523406982, "kl": 0.08367524109780788, "learning_rate": 2.771e-06, "loss": 0.059, "num_tokens": 507332.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 31.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.010694318450987339, "kl": 0.01508731348440051, "learning_rate": 2.7706666666666666e-06, "loss": 0.0008, "num_tokens": 507592.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 31.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 2.426229476928711, "kl": 0.0468965582549572, "learning_rate": 2.7703333333333334e-06, "loss": -0.0879, "num_tokens": 508014.0, "reward": 1.5499999523162842, "reward_std": 1.2556538581848145, "rewards/reward_combined/mean": 1.5499999523162842, "rewards/reward_combined/std": 1.2556538581848145, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.08689247071743011, "kl": 0.023760899901390076, "learning_rate": 2.77e-06, "loss": 0.0012, "num_tokens": 508280.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 31.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.3459792733192444, "kl": 0.03823373280465603, "learning_rate": 2.769666666666667e-06, "loss": 0.0023, "num_tokens": 508560.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 31.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02969568409025669, "kl": 0.006482162047177553, "learning_rate": 2.7693333333333333e-06, "loss": 0.0003, "num_tokens": 508828.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 31.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.28915145993232727, "kl": 0.04711037874221802, "learning_rate": 2.769e-06, "loss": 0.0026, "num_tokens": 509057.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 31.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.005393181461840868, "kl": 0.002824738621711731, "learning_rate": 2.768666666666667e-06, "loss": 0.0001, "num_tokens": 509293.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 31.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 3.3077964782714844, "kl": 0.089900903403759, "learning_rate": 2.7683333333333337e-06, "loss": -0.2119, "num_tokens": 509636.0, "reward": 3.375, "reward_std": 0.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 0.25, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 31.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.014041605405509472, "kl": 0.0006363093852996826, "learning_rate": 2.768e-06, "loss": 0.0, "num_tokens": 509844.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 31.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.07646717131137848, "kl": 0.014523346908390522, "learning_rate": 2.767666666666667e-06, "loss": 0.0007, "num_tokens": 510176.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 31.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.13709478080272675, "kl": 0.04975891299545765, "learning_rate": 2.767333333333333e-06, "loss": 0.0025, "num_tokens": 510513.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 31.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.1815960556268692, "kl": 0.014224665239453316, "learning_rate": 2.767e-06, "loss": 0.0007, "num_tokens": 510824.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 31.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.163992777466774, "kl": 0.012088237330317497, "learning_rate": 2.7666666666666667e-06, "loss": 0.0006, "num_tokens": 511092.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1701 }, { "clip_ratio/high_max": 0.01515151560306549, "clip_ratio/high_mean": 0.01515151560306549, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01515151560306549, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.297525405883789, "kl": 0.07614044658839703, "learning_rate": 2.7663333333333335e-06, "loss": -0.1242, "num_tokens": 511401.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 31.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.19524890184402466, "kl": 0.022649593651294708, "learning_rate": 2.7660000000000003e-06, "loss": 0.001, "num_tokens": 511693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 31.555555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 4.090134143829346, "kl": 0.04897109046578407, "learning_rate": 2.7656666666666666e-06, "loss": 0.0403, "num_tokens": 511988.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 31.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.19848203659057617, "kl": 0.049016520380973816, "learning_rate": 2.7653333333333334e-06, "loss": 0.0025, "num_tokens": 512292.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012820512987673283, "clip_ratio/low_min": 0.012820512987673283, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 31.59259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 3.2279224395751953, "kl": 0.009315238101407886, "learning_rate": 2.7649999999999998e-06, "loss": -0.1678, "num_tokens": 512605.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 31.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.010073486715555191, "kl": 0.004210499115288258, "learning_rate": 2.764666666666667e-06, "loss": 0.0002, "num_tokens": 512873.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 31.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03109670616686344, "kl": 0.002433445304632187, "learning_rate": 2.7643333333333334e-06, "loss": 0.0001, "num_tokens": 513185.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08693189173936844, "kl": 0.00425832875771448, "learning_rate": 2.764e-06, "loss": 0.0002, "num_tokens": 513441.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 31.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 2.633188486099243, "kl": 0.3348800539970398, "learning_rate": 2.763666666666667e-06, "loss": 0.0498, "num_tokens": 513800.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 31.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.06907296925783157, "kl": 0.016301962081342936, "learning_rate": 2.7633333333333333e-06, "loss": 0.0008, "num_tokens": 514127.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 31.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 9.860721588134766, "kl": 0.021684397011995316, "learning_rate": 2.763e-06, "loss": 0.1494, "num_tokens": 514455.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 31.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.5636835098266602, "kl": 0.050710033625364304, "learning_rate": 2.762666666666667e-06, "loss": 0.0025, "num_tokens": 514750.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1713 }, { "clip_ratio/high_max": 0.006666666828095913, "clip_ratio/high_mean": 0.006666666828095913, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006666666828095913, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 31.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.100022315979004, "kl": 0.0645565465092659, "learning_rate": 2.7623333333333336e-06, "loss": 0.0094, "num_tokens": 515108.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 31.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06662952154874802, "kl": 0.1508898138999939, "learning_rate": 2.762e-06, "loss": 0.0075, "num_tokens": 515417.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 31.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.10787297040224075, "kl": 0.022396287880837917, "learning_rate": 2.7616666666666668e-06, "loss": 0.0011, "num_tokens": 515704.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 31.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.1203509047627449, "kl": 0.014909719116985798, "learning_rate": 2.761333333333333e-06, "loss": 0.0008, "num_tokens": 516013.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.06831133365631104, "kl": 0.004663305822759867, "learning_rate": 2.761e-06, "loss": 0.0002, "num_tokens": 516277.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 31.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0066061001271009445, "kl": 0.0006123781204223633, "learning_rate": 2.7606666666666667e-06, "loss": 0.0, "num_tokens": 516521.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 31.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.299671173095703, "kl": 0.01803624164313078, "learning_rate": 2.7603333333333335e-06, "loss": 0.0027, "num_tokens": 516795.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 31.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.014989754185080528, "kl": 0.004661129554733634, "learning_rate": 2.7600000000000003e-06, "loss": 0.0002, "num_tokens": 517069.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 31.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.021326709538698196, "kl": 0.0007004812359809875, "learning_rate": 2.7596666666666666e-06, "loss": 0.0, "num_tokens": 517281.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.022083792835474014, "kl": 0.0032550841569900513, "learning_rate": 2.7593333333333334e-06, "loss": 0.0002, "num_tokens": 517541.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 31.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.12020346522331238, "kl": 0.04598797671496868, "learning_rate": 2.7589999999999998e-06, "loss": 0.0024, "num_tokens": 517837.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 31.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.26906564831733704, "kl": 0.028645590879023075, "learning_rate": 2.758666666666667e-06, "loss": 0.0014, "num_tokens": 518069.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 31.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.644770622253418, "kl": 0.036845942959189415, "learning_rate": 2.7583333333333333e-06, "loss": 0.0417, "num_tokens": 518401.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 31.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.001290903310291469, "kl": 0.0011155882384628057, "learning_rate": 2.758e-06, "loss": 0.0001, "num_tokens": 518681.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.02473565563559532, "kl": 0.014539883937686682, "learning_rate": 2.757666666666667e-06, "loss": 0.0007, "num_tokens": 518965.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 32.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.836700439453125, "kl": 0.031715997494757175, "learning_rate": 2.7573333333333332e-06, "loss": -0.0034, "num_tokens": 519261.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 32.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08739776164293289, "kl": 0.001209259033203125, "learning_rate": 2.757e-06, "loss": 0.0001, "num_tokens": 519473.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.223323345184326, "kl": 0.026694633066654205, "learning_rate": 2.756666666666667e-06, "loss": -0.0343, "num_tokens": 519762.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 32.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 1.1310564279556274, "kl": 0.10389725491404533, "learning_rate": 2.7563333333333336e-06, "loss": 0.0064, "num_tokens": 520011.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 7.973709583282471, "kl": 0.12362165376543999, "learning_rate": 2.756e-06, "loss": -0.0595, "num_tokens": 520316.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 32.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 5.902437686920166, "kl": 0.053349819034338, "learning_rate": 2.7556666666666667e-06, "loss": 0.3656, "num_tokens": 520552.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.14474616944789886, "kl": 0.008931699441745877, "learning_rate": 2.755333333333333e-06, "loss": 0.0005, "num_tokens": 520819.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 32.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.015790093690156937, "kl": 0.003581728204153478, "learning_rate": 2.755e-06, "loss": 0.0002, "num_tokens": 521099.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 32.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 6.182296276092529, "kl": 0.10936189070343971, "learning_rate": 2.754666666666667e-06, "loss": -0.035, "num_tokens": 521424.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 32.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 1.206444263458252, "kl": 0.10994753241539001, "learning_rate": 2.7543333333333334e-06, "loss": 0.0055, "num_tokens": 521644.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 32.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02155369333922863, "kl": 0.0005641579627990723, "learning_rate": 2.7540000000000002e-06, "loss": 0.0, "num_tokens": 521880.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 32.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.10336758941411972, "kl": 0.038915976881980896, "learning_rate": 2.7536666666666666e-06, "loss": 0.0019, "num_tokens": 522208.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 32.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09731552749872208, "kl": 0.01642657583579421, "learning_rate": 2.7533333333333334e-06, "loss": 0.0009, "num_tokens": 522480.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 32.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.575925827026367, "kl": 0.34525352716445923, "learning_rate": 2.753e-06, "loss": 0.0599, "num_tokens": 522780.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.023094866424798965, "kl": 0.014685933012515306, "learning_rate": 2.752666666666667e-06, "loss": 0.0007, "num_tokens": 523064.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 32.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.0287400484085083, "kl": 0.15829047560691833, "learning_rate": 2.7523333333333333e-06, "loss": -0.0132, "num_tokens": 523430.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 32.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 5.175476551055908, "kl": 0.028773130849003792, "learning_rate": 2.752e-06, "loss": -0.0359, "num_tokens": 523741.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 32.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.021959371864795685, "kl": 0.0035362214548513293, "learning_rate": 2.751666666666667e-06, "loss": 0.0002, "num_tokens": 524001.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 32.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.5844467282295227, "kl": 0.08374781534075737, "learning_rate": 2.7513333333333332e-06, "loss": 0.0047, "num_tokens": 524372.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.25, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 32.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 4.082003116607666, "kl": 0.061678726226091385, "learning_rate": 2.751e-06, "loss": 0.3032, "num_tokens": 524789.0, "reward": 0.6749999523162842, "reward_std": 3.4480671882629395, "rewards/reward_combined/mean": 0.6749999523162842, "rewards/reward_combined/std": 3.4480671882629395, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 32.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0972365215420723, "kl": 0.014674036763608456, "learning_rate": 2.7506666666666668e-06, "loss": 0.0008, "num_tokens": 525099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 32.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.2397584915161133, "kl": 0.042815130203962326, "learning_rate": 2.7503333333333336e-06, "loss": 0.0409, "num_tokens": 525508.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 32.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.2768588960170746, "kl": 0.10103439539670944, "learning_rate": 2.75e-06, "loss": 0.0051, "num_tokens": 525837.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 32.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.10606963187456131, "kl": 0.02139665000140667, "learning_rate": 2.7496666666666667e-06, "loss": 0.0011, "num_tokens": 526174.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 32.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03547282889485359, "kl": 0.005235506920143962, "learning_rate": 2.749333333333333e-06, "loss": 0.0003, "num_tokens": 526464.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 32.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.4175496101379395, "kl": 0.1552463248372078, "learning_rate": 2.7490000000000003e-06, "loss": 0.0445, "num_tokens": 526778.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 32.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04348023980855942, "kl": 0.003014126908965409, "learning_rate": 2.748666666666667e-06, "loss": 0.0002, "num_tokens": 527081.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 32.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.5907437205314636, "kl": 0.032865116372704506, "learning_rate": 2.7483333333333334e-06, "loss": 0.0022, "num_tokens": 527299.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 32.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.109654903411865, "kl": 0.017590856179594994, "learning_rate": 2.748e-06, "loss": 0.1314, "num_tokens": 527630.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 32.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.3732783794403076, "kl": 0.06276243925094604, "learning_rate": 2.7476666666666666e-06, "loss": 0.1708, "num_tokens": 527987.0, "reward": 4.5, "reward_std": 2.345207929611206, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.345207929611206, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 32.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0586363710463047, "kl": 0.002315439283847809, "learning_rate": 2.7473333333333333e-06, "loss": 0.0001, "num_tokens": 528195.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 32.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.10004890710115433, "kl": 0.01631145551800728, "learning_rate": 2.747e-06, "loss": 0.0008, "num_tokens": 528455.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 32.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.23430277407169342, "kl": 0.02228065828603576, "learning_rate": 2.746666666666667e-06, "loss": 0.0011, "num_tokens": 528726.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 32.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.8634611368179321, "kl": 0.051091745495796204, "learning_rate": 2.7463333333333333e-06, "loss": 0.0037, "num_tokens": 528969.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 32.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.07443425059318542, "kl": 0.011200740467756987, "learning_rate": 2.746e-06, "loss": 0.0006, "num_tokens": 529294.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 32.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.08659444749355316, "kl": 0.04875837825238705, "learning_rate": 2.745666666666667e-06, "loss": 0.0024, "num_tokens": 529628.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 32.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015384593280032277, "kl": 0.000998933392111212, "learning_rate": 2.745333333333333e-06, "loss": 0.0, "num_tokens": 529908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 32.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1019458994269371, "kl": 0.007903525372967124, "learning_rate": 2.745e-06, "loss": 0.0004, "num_tokens": 530236.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 32.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.028836917132139206, "kl": 0.002044334774836898, "learning_rate": 2.7446666666666668e-06, "loss": 0.0001, "num_tokens": 530498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 85.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 85.75, "completions/mean_terminated_length": 29.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.466571092605591, "kl": 0.03677182085812092, "learning_rate": 2.7443333333333335e-06, "loss": 0.4235, "num_tokens": 531057.0, "reward": 2.049999952316284, "reward_std": 4.0509257316589355, "rewards/reward_combined/mean": 2.049999952316284, "rewards/reward_combined/std": 4.0509257316589355, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 73.25, "completions/mean_terminated_length": 12.333333969116211, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 32.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.5762598514556885, "kl": 0.013551326934248209, "learning_rate": 2.744e-06, "loss": 0.4315, "num_tokens": 531570.0, "reward": 4.550000190734863, "reward_std": 3.5930488109588623, "rewards/reward_combined/mean": 4.550000190734863, "rewards/reward_combined/std": 3.5930488109588623, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 32.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.07321508228778839, "kl": 0.005755336955189705, "learning_rate": 2.7436666666666667e-06, "loss": 0.0003, "num_tokens": 531893.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0435134693980217, "kl": 0.00424116151407361, "learning_rate": 2.743333333333333e-06, "loss": 0.0002, "num_tokens": 532177.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 32.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 6.904435157775879, "kl": 0.04722149111330509, "learning_rate": 2.7430000000000002e-06, "loss": 0.0164, "num_tokens": 532469.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 32.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.006464575883001089, "kl": 0.01561238057911396, "learning_rate": 2.742666666666667e-06, "loss": 0.0008, "num_tokens": 532729.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 32.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 6.428771018981934, "kl": 0.06700919196009636, "learning_rate": 2.7423333333333334e-06, "loss": 0.0515, "num_tokens": 533039.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 32.870370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 6.432555198669434, "kl": 0.08067074045538902, "learning_rate": 2.742e-06, "loss": 0.0303, "num_tokens": 533335.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 32.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.05425287410616875, "kl": 0.0026105031138285995, "learning_rate": 2.7416666666666665e-06, "loss": 0.0001, "num_tokens": 533646.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 32.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009603277430869639, "kl": 0.0004247836768627167, "learning_rate": 2.7413333333333333e-06, "loss": 0.0, "num_tokens": 533906.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 32.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.011686128564178944, "kl": 0.0008441567624686286, "learning_rate": 2.741e-06, "loss": 0.0, "num_tokens": 534125.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 32.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06443269550800323, "kl": 0.0023990795016288757, "learning_rate": 2.740666666666667e-06, "loss": 0.0001, "num_tokens": 534381.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 32.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03564298525452614, "kl": 0.006909394636750221, "learning_rate": 2.7403333333333332e-06, "loss": 0.0004, "num_tokens": 534651.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 32.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.17307555675506592, "kl": 0.05741347745060921, "learning_rate": 2.74e-06, "loss": 0.0029, "num_tokens": 534958.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 33.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09630684554576874, "kl": 0.011841883417218924, "learning_rate": 2.739666666666667e-06, "loss": 0.0006, "num_tokens": 535231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 33.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.0882561206817627, "kl": 0.04686512239277363, "learning_rate": 2.739333333333333e-06, "loss": 0.4561, "num_tokens": 535742.0, "reward": 5.925000190734863, "reward_std": 4.150000095367432, "rewards/reward_combined/mean": 5.925000190734863, "rewards/reward_combined/std": 4.150000095367432, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 33.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07120820134878159, "kl": 0.16184628754854202, "learning_rate": 2.7390000000000004e-06, "loss": 0.0081, "num_tokens": 536053.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 33.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 9.441292762756348, "kl": 0.08373421430587769, "learning_rate": 2.7386666666666667e-06, "loss": 0.286, "num_tokens": 536287.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 33.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 2.2790210247039795, "kl": 0.15694626420736313, "learning_rate": 2.7383333333333335e-06, "loss": 0.0245, "num_tokens": 536603.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 33.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.47888123989105225, "kl": 0.048293492989614606, "learning_rate": 2.738e-06, "loss": 0.0024, "num_tokens": 536863.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 33.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036101879086345434, "kl": 0.0004414692521095276, "learning_rate": 2.7376666666666667e-06, "loss": 0.0, "num_tokens": 537123.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 33.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.06551062315702438, "kl": 0.013508519157767296, "learning_rate": 2.7373333333333334e-06, "loss": 0.0007, "num_tokens": 537429.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 33.148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 10.506830215454102, "kl": 0.00290696881711483, "learning_rate": 2.7370000000000002e-06, "loss": 0.3031, "num_tokens": 537678.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 33.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.15645378828048706, "kl": 0.013062998652458191, "learning_rate": 2.736666666666667e-06, "loss": 0.0007, "num_tokens": 537894.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 33.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 1.951319932937622, "kl": 0.04044189304113388, "learning_rate": 2.7363333333333334e-06, "loss": -0.0002, "num_tokens": 538298.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 33.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.059244900941848755, "kl": 0.0010513767483644187, "learning_rate": 2.736e-06, "loss": 0.0001, "num_tokens": 538511.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 33.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.014438227750360966, "kl": 0.0013181971735320985, "learning_rate": 2.7356666666666665e-06, "loss": 0.0001, "num_tokens": 538820.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.13680005073547363, "kl": 0.02158356038853526, "learning_rate": 2.7353333333333333e-06, "loss": 0.0012, "num_tokens": 539091.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 33.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.02796652540564537, "kl": 0.09261302649974823, "learning_rate": 2.735e-06, "loss": 0.0046, "num_tokens": 539457.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 33.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 6.050706386566162, "kl": 0.05082953721284866, "learning_rate": 2.734666666666667e-06, "loss": 0.0044, "num_tokens": 539787.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 33.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.560703754425049, "kl": 0.1710280478000641, "learning_rate": 2.7343333333333332e-06, "loss": -0.014, "num_tokens": 540135.0, "reward": 5.375, "reward_std": 2.75, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.75, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 33.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 2.4310381412506104, "kl": 0.04510759375989437, "learning_rate": 2.734e-06, "loss": 0.0159, "num_tokens": 540464.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 33.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.4282565116882324, "kl": 0.07379312999546528, "learning_rate": 2.7336666666666668e-06, "loss": 0.0038, "num_tokens": 540759.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 33.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.15546134114265442, "kl": 0.047861428931355476, "learning_rate": 2.733333333333333e-06, "loss": 0.0028, "num_tokens": 541077.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 33.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 4.939266204833984, "kl": 0.03174888156354427, "learning_rate": 2.7330000000000003e-06, "loss": 0.0013, "num_tokens": 541337.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 33.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.4484003782272339, "kl": 0.023341971449553967, "learning_rate": 2.7326666666666667e-06, "loss": 0.0012, "num_tokens": 541593.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 33.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 4.292440891265869, "kl": 0.2836736887693405, "learning_rate": 2.7323333333333335e-06, "loss": 0.0335, "num_tokens": 541878.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 33.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.6213285326957703, "kl": 0.08234158530831337, "learning_rate": 2.732e-06, "loss": 0.0045, "num_tokens": 542168.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 33.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.045906562358140945, "kl": 0.0012828707695007324, "learning_rate": 2.7316666666666666e-06, "loss": 0.0001, "num_tokens": 542374.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 33.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.24615296721458435, "kl": 0.010953473305562511, "learning_rate": 2.7313333333333334e-06, "loss": 0.0006, "num_tokens": 542596.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.036423616111278534, "kl": 0.002574182115495205, "learning_rate": 2.731e-06, "loss": 0.0001, "num_tokens": 542878.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 33.5, "frac_reward_zero_std": 0.0, "grad_norm": 5.5580596923828125, "kl": 0.04375036060810089, "learning_rate": 2.730666666666667e-06, "loss": 0.0604, "num_tokens": 543207.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005747126415371895, "clip_ratio/low_min": 0.005747126415371895, "clip_ratio/region_mean": 0.005747126415371895, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 33.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.0974574089050293, "kl": 0.06714414805173874, "learning_rate": 2.7303333333333333e-06, "loss": 0.1271, "num_tokens": 543596.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 33.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.010035802610218525, "kl": 0.00916808657348156, "learning_rate": 2.73e-06, "loss": 0.0005, "num_tokens": 543868.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.028103139251470566, "kl": 0.002121154509950429, "learning_rate": 2.7296666666666665e-06, "loss": 0.0001, "num_tokens": 544145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 33.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028382311575114727, "kl": 0.00012265145778656006, "learning_rate": 2.7293333333333333e-06, "loss": 0.0, "num_tokens": 544365.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 33.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.12235185503959656, "kl": 0.016257425770163536, "learning_rate": 2.729e-06, "loss": 0.0008, "num_tokens": 544658.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 33.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 10.487502098083496, "kl": 0.06302844732999802, "learning_rate": 2.728666666666667e-06, "loss": 0.1364, "num_tokens": 544920.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 33.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 3.118229866027832, "kl": 0.045467047952115536, "learning_rate": 2.7283333333333336e-06, "loss": 0.0879, "num_tokens": 545257.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.08995601534843445, "kl": 0.012283294927328825, "learning_rate": 2.728e-06, "loss": 0.0006, "num_tokens": 545531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 33.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.5635440945625305, "kl": 0.06689948117127642, "learning_rate": 2.7276666666666668e-06, "loss": 0.003, "num_tokens": 545834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.04382326081395149, "kl": 0.003987176809459925, "learning_rate": 2.7273333333333335e-06, "loss": 0.0002, "num_tokens": 546130.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 82.75, "completions/mean_terminated_length": 25.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 33.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.8575448989868164, "kl": 0.017958277836441994, "learning_rate": 2.7270000000000003e-06, "loss": 0.3982, "num_tokens": 546689.0, "reward": 3.799999952316284, "reward_std": 5.588082790374756, "rewards/reward_combined/mean": 3.799999952316284, "rewards/reward_combined/std": 5.588082313537598, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 33.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.1541658639907837, "kl": 0.020028742030262947, "learning_rate": 2.7266666666666667e-06, "loss": 0.001, "num_tokens": 547001.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.15733495354652405, "kl": 0.011172421742230654, "learning_rate": 2.7263333333333335e-06, "loss": 0.0006, "num_tokens": 547264.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 33.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 7.272762298583984, "kl": 0.038841452449560165, "learning_rate": 2.726e-06, "loss": 0.3965, "num_tokens": 547563.0, "reward": 3.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 3.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 33.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787581741809845, "kl": 0.06607088446617126, "learning_rate": 2.7256666666666666e-06, "loss": 0.0032, "num_tokens": 547904.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.037928588688373566, "kl": 0.006824996671639383, "learning_rate": 2.7253333333333334e-06, "loss": 0.0004, "num_tokens": 548174.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 33.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.5614644885063171, "kl": 0.058784357039257884, "learning_rate": 2.725e-06, "loss": 0.0031, "num_tokens": 548464.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 33.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.05880916118621826, "kl": 0.0033947378396987915, "learning_rate": 2.724666666666667e-06, "loss": 0.0002, "num_tokens": 548736.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 33.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 3.120548963546753, "kl": 0.13346320390701294, "learning_rate": 2.7243333333333333e-06, "loss": -0.0513, "num_tokens": 549037.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 33.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.08798273652791977, "kl": 0.01510192733258009, "learning_rate": 2.724e-06, "loss": 0.0007, "num_tokens": 549368.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 33.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.018094424158334732, "kl": 0.0017211210215464234, "learning_rate": 2.7236666666666665e-06, "loss": 0.0001, "num_tokens": 549603.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 33.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.009208236820995808, "kl": 0.00010989755264745327, "learning_rate": 2.7233333333333337e-06, "loss": 0.0, "num_tokens": 549873.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 33.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.026512209326028824, "kl": 0.0014691509422846138, "learning_rate": 2.723e-06, "loss": 0.0001, "num_tokens": 550135.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 33.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.26817670464515686, "kl": 0.04431943129748106, "learning_rate": 2.722666666666667e-06, "loss": 0.0022, "num_tokens": 550435.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 33.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06412789970636368, "kl": 0.004469543811865151, "learning_rate": 2.7223333333333336e-06, "loss": 0.0002, "num_tokens": 550756.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007462686393409967, "clip_ratio/low_min": 0.007462686393409967, "clip_ratio/region_mean": 0.007462686393409967, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 33.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.906788349151611, "kl": 0.07722705788910389, "learning_rate": 2.722e-06, "loss": 0.0279, "num_tokens": 551104.0, "reward": 3.625, "reward_std": 2.75, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 2.75, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 34.0, "frac_reward_zero_std": 0.0, "grad_norm": 6.915585517883301, "kl": 0.06409088708460331, "learning_rate": 2.7216666666666667e-06, "loss": 0.1676, "num_tokens": 551452.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 34.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.027719201520085335, "kl": 0.0017488243756815791, "learning_rate": 2.7213333333333335e-06, "loss": 0.0001, "num_tokens": 551730.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 34.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05981970950961113, "kl": 0.0038055373588576913, "learning_rate": 2.7210000000000003e-06, "loss": 0.0002, "num_tokens": 552036.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 34.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 9.08851432800293, "kl": 0.13051492274098564, "learning_rate": 2.7206666666666667e-06, "loss": -0.0264, "num_tokens": 552255.0, "reward": 2.375, "reward_std": 1.8874585628509521, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.8874585628509521, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 34.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 6.732242584228516, "kl": 0.01689472608268261, "learning_rate": 2.7203333333333334e-06, "loss": 0.1354, "num_tokens": 552505.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 34.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.6987879276275635, "kl": 0.031687792390584946, "learning_rate": 2.72e-06, "loss": 0.1352, "num_tokens": 552854.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 34.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.02235507033765316, "kl": 0.0011891061440110207, "learning_rate": 2.7196666666666666e-06, "loss": 0.0001, "num_tokens": 553182.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 34.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.09800175577402115, "kl": 0.007735051680356264, "learning_rate": 2.7193333333333334e-06, "loss": 0.0004, "num_tokens": 553473.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.05617184564471245, "kl": 0.008804846089333296, "learning_rate": 2.719e-06, "loss": 0.0004, "num_tokens": 553757.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007352941203862429, "clip_ratio/low_min": 0.007352941203862429, "clip_ratio/region_mean": 0.007352941203862429, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 34.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 3.230052947998047, "kl": 0.012352522229775786, "learning_rate": 2.718666666666667e-06, "loss": -0.0113, "num_tokens": 554111.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.1942746937274933, "kl": 0.03265456482768059, "learning_rate": 2.7183333333333333e-06, "loss": 0.0016, "num_tokens": 554397.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 34.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.028727499768137932, "kl": 0.002476705703884363, "learning_rate": 2.718e-06, "loss": 0.0001, "num_tokens": 554631.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 34.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.02679610252380371, "kl": 0.0014699590974487364, "learning_rate": 2.7176666666666664e-06, "loss": 0.0001, "num_tokens": 554893.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 34.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.2361757904291153, "kl": 0.00960657000541687, "learning_rate": 2.7173333333333336e-06, "loss": 0.0006, "num_tokens": 555109.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1849 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.014705882407724857, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014705882407724857, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 34.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 13.724506378173828, "kl": 0.8115830812603235, "learning_rate": 2.717e-06, "loss": 0.0364, "num_tokens": 555486.0, "reward": 2.0, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 1.7320507764816284, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 34.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.024522658437490463, "kl": 0.0023338720202445984, "learning_rate": 2.7166666666666668e-06, "loss": 0.0001, "num_tokens": 555696.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 34.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1523863673210144, "kl": 0.0074703507125377655, "learning_rate": 2.7163333333333336e-06, "loss": 0.0004, "num_tokens": 555956.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 34.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.11143297702074051, "kl": 0.01677517336793244, "learning_rate": 2.716e-06, "loss": 0.0009, "num_tokens": 556228.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 34.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 5.107197284698486, "kl": 0.28561924397945404, "learning_rate": 2.7156666666666667e-06, "loss": -0.019, "num_tokens": 556530.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 34.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0416756235063076, "kl": 0.007633190951310098, "learning_rate": 2.7153333333333335e-06, "loss": 0.0004, "num_tokens": 556800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 34.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.1109652891755104, "kl": 0.022345018573105335, "learning_rate": 2.7150000000000003e-06, "loss": 0.0011, "num_tokens": 557132.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 34.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.20400479435920715, "kl": 0.06068544089794159, "learning_rate": 2.7146666666666666e-06, "loss": 0.003, "num_tokens": 557432.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 34.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.060002490878105164, "kl": 0.007621090626344085, "learning_rate": 2.7143333333333334e-06, "loss": 0.0004, "num_tokens": 557704.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 34.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 6.0465216636657715, "kl": 0.04501319723203778, "learning_rate": 2.7139999999999998e-06, "loss": 0.2299, "num_tokens": 557996.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 34.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.043118901550769806, "kl": 0.0008434891351498663, "learning_rate": 2.7136666666666665e-06, "loss": 0.0, "num_tokens": 558252.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 34.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.002917917910963297, "kl": 3.408889097045176e-05, "learning_rate": 2.7133333333333338e-06, "loss": 0.0, "num_tokens": 558524.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 34.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.8276538848876953, "kl": 0.10735499858856201, "learning_rate": 2.713e-06, "loss": 0.1364, "num_tokens": 558877.0, "reward": 5.375, "reward_std": 4.25, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 4.25, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 34.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.027922697365283966, "kl": 0.0015048664063215256, "learning_rate": 2.712666666666667e-06, "loss": 0.0001, "num_tokens": 559189.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 34.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.1018409729003906, "kl": 0.08543189987540245, "learning_rate": 2.7123333333333333e-06, "loss": 0.0513, "num_tokens": 559504.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 34.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.1569334864616394, "kl": 0.015963513404130936, "learning_rate": 2.712e-06, "loss": 0.0008, "num_tokens": 559794.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.4369741380214691, "kl": 0.08534810319542885, "learning_rate": 2.711666666666667e-06, "loss": 0.0043, "num_tokens": 560063.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 34.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0688805803656578, "kl": 0.04286644235253334, "learning_rate": 2.7113333333333336e-06, "loss": 0.0021, "num_tokens": 560391.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.010634365491569042, "kl": 0.009167622774839401, "learning_rate": 2.711e-06, "loss": 0.0005, "num_tokens": 560663.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 34.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.007116298656910658, "kl": 0.015543228946626186, "learning_rate": 2.7106666666666667e-06, "loss": 0.0008, "num_tokens": 560923.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 34.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.08665429055690765, "kl": 0.04061730671674013, "learning_rate": 2.7103333333333335e-06, "loss": 0.0021, "num_tokens": 561221.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 34.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.05451120436191559, "kl": 0.1576053649187088, "learning_rate": 2.71e-06, "loss": 0.0079, "num_tokens": 561532.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 34.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.07285292446613312, "kl": 0.006827495992183685, "learning_rate": 2.7096666666666667e-06, "loss": 0.0003, "num_tokens": 561748.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 34.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.004154373425990343, "kl": 0.00019634515047073364, "learning_rate": 2.7093333333333335e-06, "loss": 0.0, "num_tokens": 561968.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 34.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.073772668838501, "kl": 0.03586939349770546, "learning_rate": 2.7090000000000002e-06, "loss": 0.0533, "num_tokens": 562386.0, "reward": 2.875, "reward_std": 0.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 0.25, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 34.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.00695054093375802, "kl": 0.0025257617235183716, "learning_rate": 2.7086666666666666e-06, "loss": 0.0001, "num_tokens": 562622.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 34.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03961246460676193, "kl": 0.003956240834668279, "learning_rate": 2.7083333333333334e-06, "loss": 0.0002, "num_tokens": 562918.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 34.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017291797557845712, "kl": 0.001061158487573266, "learning_rate": 2.7079999999999997e-06, "loss": 0.0001, "num_tokens": 563198.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 34.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.2501739263534546, "kl": 0.037852637469768524, "learning_rate": 2.707666666666667e-06, "loss": 0.0019, "num_tokens": 563519.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 34.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1451413482427597, "kl": 0.01997746340930462, "learning_rate": 2.7073333333333337e-06, "loss": 0.001, "num_tokens": 563783.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 34.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.04497898742556572, "kl": 0.0004963263927493244, "learning_rate": 2.707e-06, "loss": 0.0, "num_tokens": 563996.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.04191657528281212, "kl": 0.004467744147405028, "learning_rate": 2.706666666666667e-06, "loss": 0.0002, "num_tokens": 564286.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.31732162833213806, "kl": 0.026654242421500385, "learning_rate": 2.7063333333333332e-06, "loss": 0.0014, "num_tokens": 564586.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 34.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.8125565648078918, "kl": 0.08514676988124847, "learning_rate": 2.706e-06, "loss": 0.0042, "num_tokens": 564920.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.11653347313404083, "kl": 0.006543122231960297, "learning_rate": 2.705666666666667e-06, "loss": 0.0003, "num_tokens": 565186.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.048593275249004364, "kl": 0.0035750133683905005, "learning_rate": 2.7053333333333336e-06, "loss": 0.0002, "num_tokens": 565470.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 34.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.16356879472732544, "kl": 0.030564725399017334, "learning_rate": 2.705e-06, "loss": 0.0015, "num_tokens": 565766.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 88.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 88.5, "completions/mean_terminated_length": 32.66666793823242, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 34.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.4624722003936768, "kl": 0.0702102892100811, "learning_rate": 2.7046666666666667e-06, "loss": 0.3952, "num_tokens": 566336.0, "reward": 1.75, "reward_std": 3.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 3.5, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 34.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02429959364235401, "kl": 0.09237717092037201, "learning_rate": 2.7043333333333335e-06, "loss": 0.0046, "num_tokens": 566702.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 34.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04286166653037071, "kl": 0.003630565945059061, "learning_rate": 2.704e-06, "loss": 0.0002, "num_tokens": 567025.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 35.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.11360520124435425, "kl": 0.05920999124646187, "learning_rate": 2.7036666666666666e-06, "loss": 0.0031, "num_tokens": 567334.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 35.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05320507660508156, "kl": 0.002864798763766885, "learning_rate": 2.7033333333333334e-06, "loss": 0.0001, "num_tokens": 567656.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.020746750757098198, "kl": 0.0024244025407824665, "learning_rate": 2.703e-06, "loss": 0.0001, "num_tokens": 567952.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 35.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.014514111913740635, "kl": 0.0015230292920023203, "learning_rate": 2.7026666666666666e-06, "loss": 0.0001, "num_tokens": 568236.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 35.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.017323600128293037, "kl": 0.0011899081291630864, "learning_rate": 2.7023333333333334e-06, "loss": 0.0001, "num_tokens": 568498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 35.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.1588435173034668, "kl": 0.010516722686588764, "learning_rate": 2.7019999999999997e-06, "loss": 0.0005, "num_tokens": 568800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 35.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.33706891536712646, "kl": 0.029557883739471436, "learning_rate": 2.701666666666667e-06, "loss": 0.0014, "num_tokens": 569035.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 35.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962471902370453, "kl": 0.04810686968266964, "learning_rate": 2.7013333333333337e-06, "loss": 0.0024, "num_tokens": 569303.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 35.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.039578065276145935, "kl": 0.0042112350929528475, "learning_rate": 2.701e-06, "loss": 0.0002, "num_tokens": 569592.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.02448314242064953, "kl": 0.0009404495358467102, "learning_rate": 2.700666666666667e-06, "loss": 0.0, "num_tokens": 569852.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.007747381925582886, "kl": 0.003932915162295103, "learning_rate": 2.700333333333333e-06, "loss": 0.0002, "num_tokens": 570120.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 35.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.33407706022262573, "kl": 0.02135928813368082, "learning_rate": 2.7e-06, "loss": 0.0011, "num_tokens": 570389.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 35.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.001607077312655747, "kl": 0.001033989479765296, "learning_rate": 2.6996666666666668e-06, "loss": 0.0001, "num_tokens": 570669.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 35.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.026151256635785103, "kl": 0.001922984141856432, "learning_rate": 2.6993333333333335e-06, "loss": 0.0001, "num_tokens": 570949.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 35.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030562663450837135, "kl": 0.0001345425844192505, "learning_rate": 2.699e-06, "loss": 0.0, "num_tokens": 571169.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 35.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.04828093945980072, "kl": 0.0015800580149516463, "learning_rate": 2.6986666666666667e-06, "loss": 0.0001, "num_tokens": 571439.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 35.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007243466097861528, "kl": 0.002480931580066681, "learning_rate": 2.6983333333333335e-06, "loss": 0.0001, "num_tokens": 571675.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 35.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791655629873276, "kl": 0.011090089567005634, "learning_rate": 2.698e-06, "loss": 0.0006, "num_tokens": 572000.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 35.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.04971334710717201, "kl": 0.008720860816538334, "learning_rate": 2.697666666666667e-06, "loss": 0.0004, "num_tokens": 572348.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.026712121441960335, "kl": 0.001853201538324356, "learning_rate": 2.6973333333333334e-06, "loss": 0.0001, "num_tokens": 572592.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 35.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 2.464919328689575, "kl": 0.044607602059841156, "learning_rate": 2.697e-06, "loss": -0.0065, "num_tokens": 573010.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 35.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0087440125644207, "kl": 0.0016347132623195648, "learning_rate": 2.6966666666666665e-06, "loss": 0.0001, "num_tokens": 573322.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 35.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.6041319966316223, "kl": 0.046095360070466995, "learning_rate": 2.6963333333333333e-06, "loss": 0.0025, "num_tokens": 573543.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 35.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.12174998968839645, "kl": 0.05907886102795601, "learning_rate": 2.696e-06, "loss": 0.003, "num_tokens": 573875.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 35.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.1315929889678955, "kl": 0.03039420396089554, "learning_rate": 2.695666666666667e-06, "loss": 0.0101, "num_tokens": 574226.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 35.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.029044976457953453, "kl": 0.0014936476945877075, "learning_rate": 2.6953333333333337e-06, "loss": 0.0001, "num_tokens": 574436.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 35.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0063663870096206665, "kl": 0.0009174628066830337, "learning_rate": 2.695e-06, "loss": 0.0, "num_tokens": 574748.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 35.5, "frac_reward_zero_std": 0.0, "grad_norm": 5.5420122146606445, "kl": 0.10465467721223831, "learning_rate": 2.694666666666667e-06, "loss": -0.0478, "num_tokens": 575083.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 35.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.043420203030109406, "kl": 0.00966458348557353, "learning_rate": 2.694333333333333e-06, "loss": 0.0005, "num_tokens": 575434.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 35.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03351733461022377, "kl": 0.003094971179962158, "learning_rate": 2.694e-06, "loss": 0.0002, "num_tokens": 575646.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 35.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.11436270922422409, "kl": 0.003960305359214544, "learning_rate": 2.6936666666666667e-06, "loss": 0.0002, "num_tokens": 575902.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 35.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.16186094284057617, "kl": 0.04600801132619381, "learning_rate": 2.6933333333333335e-06, "loss": 0.0023, "num_tokens": 576234.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 35.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.6478888988494873, "kl": 0.02407922176644206, "learning_rate": 2.693e-06, "loss": -0.0327, "num_tokens": 576537.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 35.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 4.309637546539307, "kl": 0.052644552662968636, "learning_rate": 2.6926666666666667e-06, "loss": -0.0156, "num_tokens": 576871.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 35.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.12051546573638916, "kl": 0.10289335250854492, "learning_rate": 2.6923333333333334e-06, "loss": 0.0052, "num_tokens": 577236.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 35.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.068525031208992, "kl": 0.003370234277099371, "learning_rate": 2.692e-06, "loss": 0.0002, "num_tokens": 577496.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 35.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.11714453995227814, "kl": 0.0013956725597381592, "learning_rate": 2.691666666666667e-06, "loss": 0.0001, "num_tokens": 577708.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 35.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.273424357175827, "kl": 0.03037982527166605, "learning_rate": 2.6913333333333334e-06, "loss": 0.0016, "num_tokens": 578044.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1927 }, { "clip_ratio/high_max": 0.009803921915590763, "clip_ratio/high_mean": 0.009803921915590763, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009803921915590763, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 35.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.6420745849609375, "kl": 0.3156106173992157, "learning_rate": 2.691e-06, "loss": -0.1286, "num_tokens": 578351.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 35.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.4298832416534424, "kl": 0.04664428532123566, "learning_rate": 2.6906666666666665e-06, "loss": 0.0013, "num_tokens": 578695.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 35.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.519260883331299, "kl": 0.029957876540720463, "learning_rate": 2.6903333333333333e-06, "loss": 0.1924, "num_tokens": 578983.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 35.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.12322692573070526, "kl": 0.024027224630117416, "learning_rate": 2.69e-06, "loss": 0.0012, "num_tokens": 579280.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 35.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.1092136949300766, "kl": 0.047265585511922836, "learning_rate": 2.689666666666667e-06, "loss": 0.002, "num_tokens": 579607.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02424030937254429, "kl": 0.005536033306270838, "learning_rate": 2.6893333333333336e-06, "loss": 0.0003, "num_tokens": 579875.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.863023281097412, "kl": 0.03910275222733617, "learning_rate": 2.689e-06, "loss": 0.0251, "num_tokens": 580149.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 35.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.06410113722085953, "kl": 0.1623278483748436, "learning_rate": 2.6886666666666668e-06, "loss": 0.0081, "num_tokens": 580457.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 35.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 6.260194301605225, "kl": 0.0674049761146307, "learning_rate": 2.688333333333333e-06, "loss": 0.0243, "num_tokens": 580764.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 35.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.06127078831195831, "kl": 0.007338077761232853, "learning_rate": 2.688e-06, "loss": 0.0004, "num_tokens": 581054.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 35.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 5.13819694519043, "kl": 0.21109507232904434, "learning_rate": 2.6876666666666667e-06, "loss": 0.1245, "num_tokens": 581367.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 35.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.005136251449584961, "kl": 0.015850027091801167, "learning_rate": 2.6873333333333335e-06, "loss": 0.0008, "num_tokens": 581627.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 35.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.03257406875491142, "kl": 0.01321916887536645, "learning_rate": 2.6870000000000003e-06, "loss": 0.0007, "num_tokens": 581911.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 35.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.14937995374202728, "kl": 0.04119663592427969, "learning_rate": 2.6866666666666666e-06, "loss": 0.0021, "num_tokens": 582215.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.2690214514732361, "kl": 0.04237298294901848, "learning_rate": 2.6863333333333334e-06, "loss": 0.002, "num_tokens": 582497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.17558544874191284, "kl": 0.018677019514143467, "learning_rate": 2.686e-06, "loss": 0.001, "num_tokens": 582766.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075008501298725605, "kl": 0.0024456456303596497, "learning_rate": 2.685666666666667e-06, "loss": 0.0001, "num_tokens": 582982.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 36.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01434825174510479, "kl": 0.26557381451129913, "learning_rate": 2.6853333333333333e-06, "loss": 0.0133, "num_tokens": 583286.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 36.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.18107795715332, "kl": 0.14554791525006294, "learning_rate": 2.685e-06, "loss": 0.0069, "num_tokens": 583624.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 36.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.032823607325553894, "kl": 0.0010624155402183533, "learning_rate": 2.6846666666666665e-06, "loss": 0.0001, "num_tokens": 583832.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 36.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.031030649319291115, "kl": 0.0019805729389190674, "learning_rate": 2.6843333333333333e-06, "loss": 0.0001, "num_tokens": 584044.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 36.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.004425667691975832, "kl": 0.01601268444210291, "learning_rate": 2.684e-06, "loss": 0.0008, "num_tokens": 584304.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 36.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 3.6444244384765625, "kl": 0.07742475718259811, "learning_rate": 2.683666666666667e-06, "loss": 0.1037, "num_tokens": 584674.0, "reward": 3.375, "reward_std": 0.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 0.25, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.7247812151908875, "kl": 0.08049389312509447, "learning_rate": 2.6833333333333336e-06, "loss": 0.004, "num_tokens": 584958.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.03097403421998024, "kl": 0.013565556146204472, "learning_rate": 2.683e-06, "loss": 0.0007, "num_tokens": 585242.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.04462660476565361, "kl": 0.013474771287292242, "learning_rate": 2.6826666666666668e-06, "loss": 0.0007, "num_tokens": 585514.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 36.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.006568982731550932, "kl": 0.0008408394933212548, "learning_rate": 2.682333333333333e-06, "loss": 0.0, "num_tokens": 585776.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05752872675657272, "kl": 0.0017148196493508294, "learning_rate": 2.6820000000000003e-06, "loss": 0.0001, "num_tokens": 585995.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 36.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.113161101937294, "kl": 0.007638789131306112, "learning_rate": 2.6816666666666667e-06, "loss": 0.0004, "num_tokens": 586324.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 36.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.6874308586120605, "kl": 0.019324714317917824, "learning_rate": 2.6813333333333335e-06, "loss": -0.0336, "num_tokens": 586656.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 36.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.18057064712047577, "kl": 0.029344365932047367, "learning_rate": 2.6810000000000003e-06, "loss": 0.0015, "num_tokens": 586990.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.487419605255127, "kl": 0.4813556857407093, "learning_rate": 2.6806666666666666e-06, "loss": 0.0373, "num_tokens": 587259.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05987457558512688, "kl": 0.002563178539276123, "learning_rate": 2.6803333333333334e-06, "loss": 0.0001, "num_tokens": 587523.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 36.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.10670791566371918, "kl": 0.003754036850295961, "learning_rate": 2.68e-06, "loss": 0.0002, "num_tokens": 587756.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.17386090755462646, "kl": 0.009071653243154287, "learning_rate": 2.679666666666667e-06, "loss": 0.0006, "num_tokens": 587989.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 36.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.7542484402656555, "kl": 0.0691049792803824, "learning_rate": 2.6793333333333333e-06, "loss": 0.0042, "num_tokens": 588305.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.03952301666140556, "kl": 0.00029768794775009155, "learning_rate": 2.679e-06, "loss": 0.0, "num_tokens": 588517.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 36.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.08086838573217392, "kl": 0.003991584060713649, "learning_rate": 2.6786666666666665e-06, "loss": 0.0002, "num_tokens": 588760.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.25, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 36.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.5372467041015625, "kl": 0.04116313345730305, "learning_rate": 2.6783333333333332e-06, "loss": 0.0096, "num_tokens": 589165.0, "reward": 5.0, "reward_std": 5.0, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 5.0, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014552439097315073, "kl": 4.976242780685425e-05, "learning_rate": 2.678e-06, "loss": 0.0, "num_tokens": 589385.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.461563587188721, "kl": 0.012471605325117707, "learning_rate": 2.677666666666667e-06, "loss": 0.0003, "num_tokens": 589657.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 36.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.10504665970802307, "kl": 0.017772881779819727, "learning_rate": 2.6773333333333336e-06, "loss": 0.0009, "num_tokens": 589980.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 36.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.023820899426937103, "kl": 0.09272093325853348, "learning_rate": 2.677e-06, "loss": 0.0046, "num_tokens": 590346.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 36.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011626126943156123, "kl": 0.0012381459819152951, "learning_rate": 2.6766666666666667e-06, "loss": 0.0001, "num_tokens": 590626.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 36.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.071332186460495, "kl": 0.007835924974642694, "learning_rate": 2.676333333333333e-06, "loss": 0.0004, "num_tokens": 590953.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 36.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.003008150262758136, "kl": 0.0004336945712566376, "learning_rate": 2.6760000000000003e-06, "loss": 0.0, "num_tokens": 591213.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 36.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05171915888786316, "kl": 0.009745566640049219, "learning_rate": 2.6756666666666667e-06, "loss": 0.0005, "num_tokens": 591518.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 36.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 7.0136566162109375, "kl": 0.1905686825630255, "learning_rate": 2.6753333333333334e-06, "loss": 0.1166, "num_tokens": 591784.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.009792163968086243, "kl": 0.004565859213471413, "learning_rate": 2.6750000000000002e-06, "loss": 0.0002, "num_tokens": 592052.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 36.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.08396516740322113, "kl": 0.005421443609520793, "learning_rate": 2.6746666666666666e-06, "loss": 0.0003, "num_tokens": 592346.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 36.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.006908867042511702, "kl": 0.0019788509234786034, "learning_rate": 2.6743333333333334e-06, "loss": 0.0001, "num_tokens": 592658.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 36.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.3667496144771576, "kl": 0.11709627509117126, "learning_rate": 2.674e-06, "loss": 0.0055, "num_tokens": 593005.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 36.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.3655175268650055, "kl": 0.08529950305819511, "learning_rate": 2.673666666666667e-06, "loss": 0.0041, "num_tokens": 593320.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 36.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.02950194478034973, "kl": 0.1593296006321907, "learning_rate": 2.6733333333333333e-06, "loss": 0.008, "num_tokens": 593628.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.028065474703907967, "kl": 0.0034218335058540106, "learning_rate": 2.673e-06, "loss": 0.0002, "num_tokens": 593924.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 36.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 5.274560928344727, "kl": 0.10706992074847221, "learning_rate": 2.6726666666666664e-06, "loss": 0.0407, "num_tokens": 594299.0, "reward": 3.0, "reward_std": 3.188521146774292, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 3.188521146774292, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 36.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.009605297818779945, "kl": 0.0004290342330932617, "learning_rate": 2.6723333333333332e-06, "loss": 0.0, "num_tokens": 594571.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 36.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.045539434999227524, "kl": 0.009111804887652397, "learning_rate": 2.6720000000000004e-06, "loss": 0.0005, "num_tokens": 594863.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 36.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.08162450790405273, "kl": 0.00872807833366096, "learning_rate": 2.6716666666666668e-06, "loss": 0.0004, "num_tokens": 595157.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 36.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.051039643585681915, "kl": 0.002916533630923368, "learning_rate": 2.6713333333333336e-06, "loss": 0.0001, "num_tokens": 595435.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 36.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.9727557301521301, "kl": 0.08859988860785961, "learning_rate": 2.671e-06, "loss": 0.0043, "num_tokens": 595730.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.006847368087619543, "kl": 0.01034638099372387, "learning_rate": 2.6706666666666667e-06, "loss": 0.0005, "num_tokens": 596002.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.008849128149449825, "kl": 0.0020621493458747864, "learning_rate": 2.6703333333333335e-06, "loss": 0.0001, "num_tokens": 596238.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 36.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.15213976800441742, "kl": 0.0546210166066885, "learning_rate": 2.6700000000000003e-06, "loss": 0.0027, "num_tokens": 596542.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 36.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.07517503201961517, "kl": 0.010591336991637945, "learning_rate": 2.6696666666666666e-06, "loss": 0.0005, "num_tokens": 596876.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 36.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.36763033270835876, "kl": 0.04920933814719319, "learning_rate": 2.6693333333333334e-06, "loss": 0.0021, "num_tokens": 597157.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 36.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.14691144227981567, "kl": 0.018337004352360964, "learning_rate": 2.669e-06, "loss": 0.0009, "num_tokens": 597440.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 36.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.005945276468992233, "kl": 0.0003785049048019573, "learning_rate": 2.6686666666666666e-06, "loss": 0.0, "num_tokens": 597754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 36.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.0691694021224976, "kl": 0.10933673195540905, "learning_rate": 2.6683333333333333e-06, "loss": 0.0126, "num_tokens": 598159.0, "reward": 1.625, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.6007810831069946, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 36.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.1535937339067459, "kl": 0.05911637470126152, "learning_rate": 2.668e-06, "loss": 0.003, "num_tokens": 598462.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 37.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0974106639623642, "kl": 0.004137328127399087, "learning_rate": 2.667666666666667e-06, "loss": 0.0002, "num_tokens": 598718.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 37.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.367629528045654, "kl": 0.09553771838545799, "learning_rate": 2.6673333333333333e-06, "loss": -0.0078, "num_tokens": 599066.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 37.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.018481021746993065, "kl": 0.004939146805554628, "learning_rate": 2.667e-06, "loss": 0.0002, "num_tokens": 599336.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 37.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.006614015903323889, "kl": 0.0011533379438333213, "learning_rate": 2.6666666666666664e-06, "loss": 0.0001, "num_tokens": 599596.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 37.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.18544217944145203, "kl": 0.03620108962059021, "learning_rate": 2.6663333333333336e-06, "loss": 0.0018, "num_tokens": 599901.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 37.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.01268834713846445, "kl": 0.0005764476954936981, "learning_rate": 2.6660000000000004e-06, "loss": 0.0, "num_tokens": 600161.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 37.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 1.1501883268356323, "kl": 0.02275223797187209, "learning_rate": 2.6656666666666668e-06, "loss": -0.0506, "num_tokens": 600587.0, "reward": 2.049999952316284, "reward_std": 1.4177446365356445, "rewards/reward_combined/mean": 2.049999952316284, "rewards/reward_combined/std": 1.417744755744934, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 37.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.006446678191423416, "kl": 0.00017789999401429668, "learning_rate": 2.6653333333333335e-06, "loss": 0.0, "num_tokens": 600859.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 37.148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 6.847894191741943, "kl": 0.031437797006219625, "learning_rate": 2.665e-06, "loss": 0.1088, "num_tokens": 601134.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 37.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.007883016020059586, "kl": 0.0006027668714523315, "learning_rate": 2.6646666666666667e-06, "loss": 0.0, "num_tokens": 601350.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 37.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.03074074164032936, "kl": 0.0016907327226363122, "learning_rate": 2.6643333333333335e-06, "loss": 0.0001, "num_tokens": 601619.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 37.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07490681856870651, "kl": 0.01436805771663785, "learning_rate": 2.6640000000000002e-06, "loss": 0.0007, "num_tokens": 601910.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 37.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.03064054436981678, "kl": 0.0018362122355028987, "learning_rate": 2.6636666666666666e-06, "loss": 0.0001, "num_tokens": 602192.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 37.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.14514805376529694, "kl": 0.01454864488914609, "learning_rate": 2.6633333333333334e-06, "loss": 0.0008, "num_tokens": 602500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 37.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.6526553630828857, "kl": 0.015236596576869488, "learning_rate": 2.663e-06, "loss": 0.0237, "num_tokens": 602833.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 37.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.005322328768670559, "kl": 0.00038795835280325264, "learning_rate": 2.6626666666666665e-06, "loss": 0.0, "num_tokens": 603145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 37.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.013279959559440613, "kl": 0.0037963627837598324, "learning_rate": 2.6623333333333333e-06, "loss": 0.0002, "num_tokens": 603405.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 37.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.011032840237021446, "kl": 0.002308901399374008, "learning_rate": 2.662e-06, "loss": 0.0001, "num_tokens": 603717.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 37.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.01130884513258934, "kl": 0.26621611416339874, "learning_rate": 2.661666666666667e-06, "loss": 0.0133, "num_tokens": 604021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 37.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.02242274582386017, "kl": 0.0009990260004997253, "learning_rate": 2.6613333333333332e-06, "loss": 0.0, "num_tokens": 604231.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 37.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.055110346525907516, "kl": 0.0056349122896790504, "learning_rate": 2.661e-06, "loss": 0.0003, "num_tokens": 604522.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 37.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.006918244995176792, "kl": 0.010301381349563599, "learning_rate": 2.6606666666666664e-06, "loss": 0.0005, "num_tokens": 604794.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 37.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010429744143038988, "kl": 3.089010715484619e-05, "learning_rate": 2.6603333333333336e-06, "loss": 0.0, "num_tokens": 605014.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 37.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.01997925341129303, "kl": 0.0079887006431818, "learning_rate": 2.6600000000000004e-06, "loss": 0.0004, "num_tokens": 605320.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 37.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03852691873908043, "kl": 0.0002607181668281555, "learning_rate": 2.6596666666666667e-06, "loss": 0.0, "num_tokens": 605532.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 37.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09802347421646118, "kl": 0.006271092686802149, "learning_rate": 2.6593333333333335e-06, "loss": 0.0003, "num_tokens": 605788.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 37.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 11.044208526611328, "kl": 0.015496873296797276, "learning_rate": 2.659e-06, "loss": 0.3791, "num_tokens": 606029.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 37.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.004191862419247627, "kl": 0.01603720895946026, "learning_rate": 2.6586666666666667e-06, "loss": 0.0008, "num_tokens": 606289.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 37.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012551024556159973, "kl": 0.0012371752527542412, "learning_rate": 2.6583333333333334e-06, "loss": 0.0001, "num_tokens": 606569.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 37.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04891206696629524, "kl": 0.012084125075489283, "learning_rate": 2.6580000000000002e-06, "loss": 0.0006, "num_tokens": 606865.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 37.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.988032341003418, "kl": 0.040377695113420486, "learning_rate": 2.6576666666666666e-06, "loss": 0.2248, "num_tokens": 607241.0, "reward": 4.875, "reward_std": 5.25, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 5.25, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 37.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.04607218876481056, "kl": 0.00990409275982529, "learning_rate": 2.6573333333333334e-06, "loss": 0.0006, "num_tokens": 607540.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 37.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.25264519453048706, "kl": 0.07270743325352669, "learning_rate": 2.657e-06, "loss": 0.0036, "num_tokens": 607860.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 37.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.043640125542879105, "kl": 0.0011769604461733252, "learning_rate": 2.6566666666666665e-06, "loss": 0.0001, "num_tokens": 608094.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2031 }, { "clip_ratio/high_max": 0.00909090880304575, "clip_ratio/high_mean": 0.00909090880304575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00909090880304575, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 37.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 3.958291530609131, "kl": 0.06803623959422112, "learning_rate": 2.6563333333333337e-06, "loss": 0.0536, "num_tokens": 608452.0, "reward": 6.625, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.0966243743896484, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 37.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.23634594678878784, "kl": 0.015228984877467155, "learning_rate": 2.656e-06, "loss": 0.0008, "num_tokens": 608785.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 37.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.06777699291706085, "kl": 0.13369496539235115, "learning_rate": 2.655666666666667e-06, "loss": 0.0068, "num_tokens": 609091.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 37.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.30684855580329895, "kl": 0.06815839000046253, "learning_rate": 2.655333333333333e-06, "loss": 0.0034, "num_tokens": 609409.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 37.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.19832812249660492, "kl": 0.024977766908705235, "learning_rate": 2.655e-06, "loss": 0.0013, "num_tokens": 609755.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 37.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.5047805309295654, "kl": 0.06455581076443195, "learning_rate": 2.6546666666666668e-06, "loss": 0.0054, "num_tokens": 610118.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 37.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.523073673248291, "kl": 0.6824191145133227, "learning_rate": 2.6543333333333336e-06, "loss": 0.0371, "num_tokens": 610414.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 37.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.1935129463672638, "kl": 0.0730351060628891, "learning_rate": 2.6540000000000003e-06, "loss": 0.0035, "num_tokens": 610696.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 37.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.021079430356621742, "kl": 0.0008019383531063795, "learning_rate": 2.6536666666666667e-06, "loss": 0.0, "num_tokens": 611013.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 37.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07596131414175034, "kl": 0.0033614374697208405, "learning_rate": 2.6533333333333335e-06, "loss": 0.0002, "num_tokens": 611291.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 37.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 1.9809070825576782, "kl": 0.059764180332422256, "learning_rate": 2.653e-06, "loss": 0.0666, "num_tokens": 611687.0, "reward": 4.25, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 2.1794495582580566, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 37.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.01673242449760437, "kl": 0.0010903941001743078, "learning_rate": 2.6526666666666666e-06, "loss": 0.0001, "num_tokens": 611930.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 37.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 4.7295002937316895, "kl": 0.021687767934054136, "learning_rate": 2.6523333333333334e-06, "loss": 0.3056, "num_tokens": 612232.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 37.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.02881363406777382, "kl": 0.013736420311033726, "learning_rate": 2.652e-06, "loss": 0.0007, "num_tokens": 612516.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 37.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.035456810146570206, "kl": 0.0930885374546051, "learning_rate": 2.6516666666666665e-06, "loss": 0.0047, "num_tokens": 612880.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 37.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.11515820771455765, "kl": 0.024955608882009983, "learning_rate": 2.6513333333333333e-06, "loss": 0.0012, "num_tokens": 613205.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 37.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0065927221439778805, "kl": 0.0003466665802989155, "learning_rate": 2.651e-06, "loss": 0.0, "num_tokens": 613425.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 37.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.033989429473877, "kl": 0.21544765689759515, "learning_rate": 2.6506666666666665e-06, "loss": -0.0519, "num_tokens": 613744.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 37.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.046170637011528015, "kl": 0.001148730458226055, "learning_rate": 2.6503333333333337e-06, "loss": 0.0001, "num_tokens": 614000.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 37.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.253532737493515, "kl": 0.013601384125649929, "learning_rate": 2.65e-06, "loss": 0.0009, "num_tokens": 614243.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 38.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.6520941257476807, "kl": 0.001956335734575987, "learning_rate": 2.649666666666667e-06, "loss": 0.0381, "num_tokens": 614535.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 38.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.49216076731681824, "kl": 0.11486417427659035, "learning_rate": 2.649333333333333e-06, "loss": 0.0057, "num_tokens": 614866.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 38.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.19710887968540192, "kl": 0.020232319831848145, "learning_rate": 2.649e-06, "loss": 0.001, "num_tokens": 615194.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 38.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06992805004119873, "kl": 0.007198390318080783, "learning_rate": 2.6486666666666667e-06, "loss": 0.0004, "num_tokens": 615525.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.14179585874080658, "kl": 0.01871525961905718, "learning_rate": 2.6483333333333335e-06, "loss": 0.001, "num_tokens": 615814.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 38.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 3.338315963745117, "kl": 0.04318897798657417, "learning_rate": 2.6480000000000003e-06, "loss": -0.0093, "num_tokens": 616146.0, "reward": 4.375, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 2.0966243743896484, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 38.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.005700599867850542, "kl": 0.00030153393163345754, "learning_rate": 2.6476666666666667e-06, "loss": 0.0, "num_tokens": 616366.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2058 }, { "clip_ratio/high_max": 0.011111111380159855, "clip_ratio/high_mean": 0.011111111380159855, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 38.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 5.507438659667969, "kl": 0.05070340633392334, "learning_rate": 2.6473333333333335e-06, "loss": -0.0043, "num_tokens": 616674.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.05976412445306778, "kl": 0.004106957232579589, "learning_rate": 2.647e-06, "loss": 0.0002, "num_tokens": 616958.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 38.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 11.302739143371582, "kl": 0.055559821776114404, "learning_rate": 2.6466666666666666e-06, "loss": 0.0901, "num_tokens": 617196.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 38.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.3871672451496124, "kl": 0.05765043757855892, "learning_rate": 2.6463333333333334e-06, "loss": 0.0032, "num_tokens": 617460.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01696948893368244, "kl": 0.001366661163046956, "learning_rate": 2.646e-06, "loss": 0.0001, "num_tokens": 617744.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.002141333185136318, "kl": 8.910894393920898e-05, "learning_rate": 2.645666666666667e-06, "loss": 0.0, "num_tokens": 617964.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 38.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.7232794761657715, "kl": 0.0037858079303987324, "learning_rate": 2.6453333333333333e-06, "loss": 0.1205, "num_tokens": 618240.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 38.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.001858091214671731, "kl": 0.00010813176777446643, "learning_rate": 2.645e-06, "loss": 0.0, "num_tokens": 618508.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.23852196335792542, "kl": 0.05729203671216965, "learning_rate": 2.644666666666667e-06, "loss": 0.0024, "num_tokens": 618825.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 38.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02844509482383728, "kl": 0.0005351453874027357, "learning_rate": 2.6443333333333337e-06, "loss": 0.0, "num_tokens": 619081.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 38.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.04656428098678589, "kl": 0.007615548558533192, "learning_rate": 2.644e-06, "loss": 0.0004, "num_tokens": 619383.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.09220888465642929, "kl": 0.006497216279967688, "learning_rate": 2.643666666666667e-06, "loss": 0.0003, "num_tokens": 619662.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2070 }, { "clip_ratio/high_max": 0.010204081423580647, "clip_ratio/high_mean": 0.010204081423580647, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010204081423580647, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 38.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 4.1496195793151855, "kl": 0.12165561318397522, "learning_rate": 2.643333333333333e-06, "loss": 0.0887, "num_tokens": 620004.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 38.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016272624488919973, "kl": 0.0011427743011154234, "learning_rate": 2.643e-06, "loss": 0.0001, "num_tokens": 620284.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.24287305772304535, "kl": 0.02667728951200843, "learning_rate": 2.6426666666666667e-06, "loss": 0.0013, "num_tokens": 620538.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 38.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.09146798402070999, "kl": 0.002835690975189209, "learning_rate": 2.6423333333333335e-06, "loss": 0.0001, "num_tokens": 620754.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 38.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 6.900708198547363, "kl": 0.08538460358977318, "learning_rate": 2.6420000000000003e-06, "loss": -0.276, "num_tokens": 621040.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 38.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.4542900025844574, "kl": 0.06759600341320038, "learning_rate": 2.6416666666666666e-06, "loss": 0.0033, "num_tokens": 621341.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 38.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.31494176387786865, "kl": 0.064749326556921, "learning_rate": 2.6413333333333334e-06, "loss": 0.0036, "num_tokens": 621631.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.009203692898154259, "kl": 0.009390239603817463, "learning_rate": 2.6409999999999998e-06, "loss": 0.0005, "num_tokens": 621903.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 38.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03841273859143257, "kl": 0.007130143931135535, "learning_rate": 2.640666666666667e-06, "loss": 0.0004, "num_tokens": 622247.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 38.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1198619157075882, "kl": 0.021041665691882372, "learning_rate": 2.6403333333333334e-06, "loss": 0.0011, "num_tokens": 622515.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 38.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03470019996166229, "kl": 0.00427778234006837, "learning_rate": 2.64e-06, "loss": 0.0002, "num_tokens": 622827.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 38.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.27020177245140076, "kl": 0.053211357444524765, "learning_rate": 2.639666666666667e-06, "loss": 0.0027, "num_tokens": 623129.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 38.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 3.43861722946167, "kl": 0.8715322834905237, "learning_rate": 2.6393333333333333e-06, "loss": -0.0466, "num_tokens": 623416.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 38.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.008841919712722301, "kl": 0.0005360543727874756, "learning_rate": 2.639e-06, "loss": 0.0, "num_tokens": 623676.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 38.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.006221949588507414, "kl": 0.0009625107049942017, "learning_rate": 2.638666666666667e-06, "loss": 0.0, "num_tokens": 623920.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 38.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 4.516137599945068, "kl": 0.08168338239192963, "learning_rate": 2.6383333333333336e-06, "loss": 0.1445, "num_tokens": 624205.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 38.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.015516391955316067, "kl": 0.0010561671806499362, "learning_rate": 2.638e-06, "loss": 0.0001, "num_tokens": 624467.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 38.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 2.3098702430725098, "kl": 0.0630001462996006, "learning_rate": 2.6376666666666668e-06, "loss": 0.0585, "num_tokens": 624818.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 38.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.34222495555877686, "kl": 0.038062578067183495, "learning_rate": 2.637333333333333e-06, "loss": 0.0019, "num_tokens": 625114.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 38.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.004281165078282356, "kl": 0.015956650488078594, "learning_rate": 2.637e-06, "loss": 0.0008, "num_tokens": 625374.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 38.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.02747010812163353, "kl": 0.0013428330421447754, "learning_rate": 2.6366666666666667e-06, "loss": 0.0001, "num_tokens": 625582.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 38.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03892965614795685, "kl": 0.0028251956100575626, "learning_rate": 2.6363333333333335e-06, "loss": 0.0002, "num_tokens": 625905.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 38.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01461059134453535, "kl": 0.26530514657497406, "learning_rate": 2.6360000000000003e-06, "loss": 0.0133, "num_tokens": 626209.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 38.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.523496150970459, "kl": 0.09386411216109991, "learning_rate": 2.6356666666666666e-06, "loss": -0.2074, "num_tokens": 626562.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 38.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.259440898895264, "kl": 0.08142408728599548, "learning_rate": 2.6353333333333334e-06, "loss": 0.0902, "num_tokens": 626891.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 38.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.00920241791754961, "kl": 0.03917407616972923, "learning_rate": 2.6349999999999998e-06, "loss": 0.002, "num_tokens": 627296.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 38.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.03610505163669586, "kl": 0.09293317049741745, "learning_rate": 2.634666666666667e-06, "loss": 0.0046, "num_tokens": 627660.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 38.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.07827702164649963, "kl": 0.006866562878713012, "learning_rate": 2.6343333333333333e-06, "loss": 0.0003, "num_tokens": 627964.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.010970977135002613, "kl": 0.002987690269947052, "learning_rate": 2.634e-06, "loss": 0.0001, "num_tokens": 628180.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 38.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.3584320843219757, "kl": 0.19197433441877365, "learning_rate": 2.633666666666667e-06, "loss": 0.0096, "num_tokens": 628490.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2100 }, { "clip_ratio/high_max": 0.05000000074505806, "clip_ratio/high_mean": 0.05000000074505806, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05000000074505806, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 10.96440315246582, "kl": 0.21618781238794327, "learning_rate": 2.6333333333333332e-06, "loss": 0.0969, "num_tokens": 628728.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 38.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.015474081970751286, "kl": 0.004213389940559864, "learning_rate": 2.633e-06, "loss": 0.0002, "num_tokens": 628998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 38.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06896309554576874, "kl": 0.012822144664824009, "learning_rate": 2.632666666666667e-06, "loss": 0.0006, "num_tokens": 629328.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12541289627552032, "kl": 0.002096331096254289, "learning_rate": 2.6323333333333336e-06, "loss": 0.0001, "num_tokens": 629541.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 38.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.007181167136877775, "kl": 0.0016973447054624557, "learning_rate": 2.632e-06, "loss": 0.0001, "num_tokens": 629853.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.034228816628456116, "kl": 0.009527456015348434, "learning_rate": 2.6316666666666667e-06, "loss": 0.0005, "num_tokens": 630142.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 39.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0050581987015903, "kl": 0.0013630688190460205, "learning_rate": 2.631333333333333e-06, "loss": 0.0001, "num_tokens": 630454.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 39.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.28263208270072937, "kl": 0.023534612730145454, "learning_rate": 2.631e-06, "loss": 0.0012, "num_tokens": 630782.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.529536008834839, "kl": 0.04671902675181627, "learning_rate": 2.630666666666667e-06, "loss": 0.2356, "num_tokens": 631111.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 39.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.029026197269558907, "kl": 0.0014323961222544312, "learning_rate": 2.6303333333333334e-06, "loss": 0.0001, "num_tokens": 631346.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 6.125633239746094, "kl": 0.06368220970034599, "learning_rate": 2.6300000000000002e-06, "loss": 0.1448, "num_tokens": 631635.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 5.142145156860352, "kl": 0.048971325159072876, "learning_rate": 2.6296666666666666e-06, "loss": 0.1058, "num_tokens": 631908.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 39.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.45784685015678406, "kl": 0.09902294422499835, "learning_rate": 2.6293333333333334e-06, "loss": 0.003, "num_tokens": 632164.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 39.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.060580141842365265, "kl": 0.009040889330208302, "learning_rate": 2.629e-06, "loss": 0.0004, "num_tokens": 632496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 39.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.1341930478811264, "kl": 0.037658074870705605, "learning_rate": 2.628666666666667e-06, "loss": 0.0021, "num_tokens": 632834.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 39.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 5.046877861022949, "kl": 0.3204140365123749, "learning_rate": 2.6283333333333333e-06, "loss": 0.0042, "num_tokens": 633137.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 39.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1252264678478241, "kl": 0.015165239572525024, "learning_rate": 2.628e-06, "loss": 0.0008, "num_tokens": 633353.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 39.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.028630902990698814, "kl": 0.0017961161211133003, "learning_rate": 2.627666666666667e-06, "loss": 0.0001, "num_tokens": 633665.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 39.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.004429290071129799, "kl": 0.015935112722218037, "learning_rate": 2.6273333333333332e-06, "loss": 0.0008, "num_tokens": 633925.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06858514994382858, "kl": 0.03680132422596216, "learning_rate": 2.627e-06, "loss": 0.0018, "num_tokens": 634225.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 39.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.05428372323513031, "kl": 0.005785493645817041, "learning_rate": 2.6266666666666668e-06, "loss": 0.0003, "num_tokens": 634481.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007947173900902271, "kl": 0.0007086461409926414, "learning_rate": 2.6263333333333336e-06, "loss": 0.0, "num_tokens": 634799.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.01637076400220394, "kl": 0.0038555373903363943, "learning_rate": 2.626e-06, "loss": 0.0002, "num_tokens": 635067.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 39.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0065318383276462555, "kl": 0.0008631125092506409, "learning_rate": 2.6256666666666667e-06, "loss": 0.0, "num_tokens": 635311.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 39.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.20261795818805695, "kl": 0.048456584103405476, "learning_rate": 2.625333333333333e-06, "loss": 0.0026, "num_tokens": 635632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 39.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 5.527511119842529, "kl": 0.060799007973400876, "learning_rate": 2.6250000000000003e-06, "loss": -0.0385, "num_tokens": 635910.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 39.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 4.544265270233154, "kl": 0.20098943263292313, "learning_rate": 2.624666666666667e-06, "loss": -0.0376, "num_tokens": 636244.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 39.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 5.801536560058594, "kl": 0.10604314506053925, "learning_rate": 2.6243333333333334e-06, "loss": -0.0168, "num_tokens": 636555.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.05629903823137283, "kl": 0.006447185412980616, "learning_rate": 2.624e-06, "loss": 0.0003, "num_tokens": 636837.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11607575416564941, "kl": 0.00840452453121543, "learning_rate": 2.6236666666666666e-06, "loss": 0.0004, "num_tokens": 637133.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.019287265837192535, "kl": 0.010870505589991808, "learning_rate": 2.6233333333333333e-06, "loss": 0.0005, "num_tokens": 637401.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 39.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.026004992425441742, "kl": 0.0011187029886059463, "learning_rate": 2.623e-06, "loss": 0.0001, "num_tokens": 637663.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 39.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04478120431303978, "kl": 0.0016564875841140747, "learning_rate": 2.622666666666667e-06, "loss": 0.0001, "num_tokens": 637923.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 39.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08575080335140228, "kl": 0.005539353413041681, "learning_rate": 2.6223333333333333e-06, "loss": 0.0002, "num_tokens": 638199.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 39.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.4723203182220459, "kl": 0.07787950336933136, "learning_rate": 2.622e-06, "loss": 0.0039, "num_tokens": 638419.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 5.368828773498535, "kl": 0.15016889572143555, "learning_rate": 2.621666666666667e-06, "loss": 0.1294, "num_tokens": 638740.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 39.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.004681735765188932, "kl": 0.04500012286007404, "learning_rate": 2.621333333333333e-06, "loss": 0.0023, "num_tokens": 639144.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.04991491138935089, "kl": 0.01225105207413435, "learning_rate": 2.621e-06, "loss": 0.0006, "num_tokens": 639432.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 39.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.21720410883426666, "kl": 0.0040453895926475525, "learning_rate": 2.6206666666666668e-06, "loss": 0.0002, "num_tokens": 639642.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.430679053068161, "kl": 0.05039902962744236, "learning_rate": 2.6203333333333335e-06, "loss": 0.0025, "num_tokens": 639932.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 39.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.2291373908519745, "kl": 0.14086488634347916, "learning_rate": 2.62e-06, "loss": 0.0068, "num_tokens": 640253.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.20720158517360687, "kl": 0.06418174505233765, "learning_rate": 2.6196666666666667e-06, "loss": 0.0032, "num_tokens": 640578.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 39.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.08042503893375397, "kl": 0.010246471967548132, "learning_rate": 2.619333333333333e-06, "loss": 0.0005, "num_tokens": 640902.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.14660649001598358, "kl": 0.028331005945801735, "learning_rate": 2.6190000000000003e-06, "loss": 0.0014, "num_tokens": 641162.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 39.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.006078194361180067, "kl": 0.01085857953876257, "learning_rate": 2.618666666666667e-06, "loss": 0.0005, "num_tokens": 641434.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 39.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.1789475679397583, "kl": 0.40612654387950897, "learning_rate": 2.6183333333333334e-06, "loss": 0.0053, "num_tokens": 641729.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 39.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.3460330367088318, "kl": 0.03149532899260521, "learning_rate": 2.618e-06, "loss": 0.0016, "num_tokens": 642011.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.27375251054763794, "kl": 0.045461490750312805, "learning_rate": 2.6176666666666665e-06, "loss": 0.0022, "num_tokens": 642291.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 39.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02751186490058899, "kl": 0.09434760734438896, "learning_rate": 2.6173333333333333e-06, "loss": 0.0047, "num_tokens": 642655.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.853020668029785, "kl": 0.04612906463444233, "learning_rate": 2.617e-06, "loss": 0.0276, "num_tokens": 642951.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 39.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 5.4463019371032715, "kl": 0.05522099696099758, "learning_rate": 2.616666666666667e-06, "loss": 0.0967, "num_tokens": 643292.0, "reward": 3.75, "reward_std": 2.723355770111084, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 2.723355770111084, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 39.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.033128779381513596, "kl": 0.00267578661441803, "learning_rate": 2.6163333333333332e-06, "loss": 0.0001, "num_tokens": 643504.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 39.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.040096163749694824, "kl": 0.0011983886361122131, "learning_rate": 2.616e-06, "loss": 0.0001, "num_tokens": 643772.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 39.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.12236732244491577, "kl": 0.02696285117417574, "learning_rate": 2.615666666666667e-06, "loss": 0.0013, "num_tokens": 644072.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 39.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.08776862174272537, "kl": 0.012718722689896822, "learning_rate": 2.615333333333333e-06, "loss": 0.0006, "num_tokens": 644372.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 39.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.022265596315264702, "kl": 0.0008883476257324219, "learning_rate": 2.6150000000000004e-06, "loss": 0.0, "num_tokens": 644584.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.3911826610565186, "kl": 0.07657848484814167, "learning_rate": 2.6146666666666667e-06, "loss": 0.0198, "num_tokens": 644905.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 39.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.018378296867012978, "kl": 0.001259174954611808, "learning_rate": 2.6143333333333335e-06, "loss": 0.0001, "num_tokens": 645122.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 39.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.019308192655444145, "kl": 0.0002418264775769785, "learning_rate": 2.614e-06, "loss": 0.0, "num_tokens": 645378.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.010439989157021046, "kl": 0.0017805024981498718, "learning_rate": 2.6136666666666667e-06, "loss": 0.0001, "num_tokens": 645614.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 40.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.678936958312988, "kl": 0.11380278319120407, "learning_rate": 2.6133333333333334e-06, "loss": 0.0047, "num_tokens": 645880.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 40.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.006137826945632696, "kl": 0.0008134424861054868, "learning_rate": 2.6130000000000002e-06, "loss": 0.0, "num_tokens": 646140.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 40.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.10522975027561188, "kl": 0.05451890267431736, "learning_rate": 2.612666666666667e-06, "loss": 0.0027, "num_tokens": 646494.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 40.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 4.383107662200928, "kl": 0.07697451114654541, "learning_rate": 2.6123333333333334e-06, "loss": 0.0166, "num_tokens": 646847.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 40.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.012628552503883839, "kl": 0.001994713209569454, "learning_rate": 2.612e-06, "loss": 0.0001, "num_tokens": 647159.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 40.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.11814142763614655, "kl": 0.041642939671874046, "learning_rate": 2.6116666666666665e-06, "loss": 0.0021, "num_tokens": 647484.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014402723172679543, "kl": 5.1952898502349854e-05, "learning_rate": 2.6113333333333333e-06, "loss": 0.0, "num_tokens": 647704.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 40.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.08754493296146393, "kl": 0.0054903654381632805, "learning_rate": 2.611e-06, "loss": 0.0003, "num_tokens": 648000.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 40.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 1.6392521858215332, "kl": 0.02249107463285327, "learning_rate": 2.610666666666667e-06, "loss": 0.0868, "num_tokens": 648417.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 40.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.6833618879318237, "kl": 0.08711736090481281, "learning_rate": 2.6103333333333332e-06, "loss": 0.0041, "num_tokens": 648716.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 40.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06694292277097702, "kl": 0.006061729276552796, "learning_rate": 2.61e-06, "loss": 0.0003, "num_tokens": 649044.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 40.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.011420521885156631, "kl": 0.000582220294745639, "learning_rate": 2.6096666666666668e-06, "loss": 0.0, "num_tokens": 649279.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 40.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.054911404848098755, "kl": 0.01224487042054534, "learning_rate": 2.609333333333333e-06, "loss": 0.0006, "num_tokens": 649547.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 40.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.10499117523431778, "kl": 0.011184069328010082, "learning_rate": 2.6090000000000003e-06, "loss": 0.0006, "num_tokens": 649807.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 40.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.05834371969103813, "kl": 0.012211961671710014, "learning_rate": 2.6086666666666667e-06, "loss": 0.0006, "num_tokens": 650139.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.3531286418437958, "kl": 0.020087300217710435, "learning_rate": 2.6083333333333335e-06, "loss": 0.0012, "num_tokens": 650360.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 40.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.3746749460697174, "kl": 0.0676138773560524, "learning_rate": 2.608e-06, "loss": 0.0037, "num_tokens": 650671.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.19808438420295715, "kl": 0.05435522459447384, "learning_rate": 2.6076666666666666e-06, "loss": 0.0027, "num_tokens": 650963.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 40.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.08828325569629669, "kl": 0.015800793655216694, "learning_rate": 2.6073333333333334e-06, "loss": 0.0008, "num_tokens": 651251.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 40.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.024463383480906487, "kl": 0.09496597200632095, "learning_rate": 2.607e-06, "loss": 0.0047, "num_tokens": 651615.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 40.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.00997218769043684, "kl": 0.26684069633483887, "learning_rate": 2.606666666666667e-06, "loss": 0.0133, "num_tokens": 651919.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 40.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.1911213994026184, "kl": 0.011752256192266941, "learning_rate": 2.6063333333333333e-06, "loss": 0.0006, "num_tokens": 652243.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 40.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 4.499086856842041, "kl": 0.034339187666773796, "learning_rate": 2.606e-06, "loss": 0.0062, "num_tokens": 652563.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 40.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.824836730957031, "kl": 0.003653585212305188, "learning_rate": 2.6056666666666665e-06, "loss": 0.0385, "num_tokens": 652837.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06412987411022186, "kl": 0.007954416330903769, "learning_rate": 2.6053333333333333e-06, "loss": 0.0004, "num_tokens": 653119.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 9.557842254638672, "kl": 0.029613006860017776, "learning_rate": 2.605e-06, "loss": 0.3348, "num_tokens": 653352.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 40.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.2712440490722656, "kl": 0.10826662555336952, "learning_rate": 2.604666666666667e-06, "loss": 0.0055, "num_tokens": 653681.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009522831067442894, "kl": 0.0020661503076553345, "learning_rate": 2.604333333333333e-06, "loss": 0.0001, "num_tokens": 653917.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 40.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09133666008710861, "kl": 0.09584910795092583, "learning_rate": 2.604e-06, "loss": 0.0048, "num_tokens": 654214.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.006647504400461912, "kl": 0.010639351326972246, "learning_rate": 2.6036666666666668e-06, "loss": 0.0005, "num_tokens": 654486.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 40.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0802210196852684, "kl": 0.0038588105235248804, "learning_rate": 2.6033333333333335e-06, "loss": 0.0002, "num_tokens": 654729.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 40.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.5295495986938477, "kl": 0.013125494122505188, "learning_rate": 2.6030000000000003e-06, "loss": 0.0336, "num_tokens": 655063.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 40.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.11449353396892548, "kl": 0.02009360957890749, "learning_rate": 2.6026666666666667e-06, "loss": 0.001, "num_tokens": 655323.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 40.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 5.113491058349609, "kl": 0.15874231606721878, "learning_rate": 2.6023333333333335e-06, "loss": 0.2664, "num_tokens": 655661.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 40.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.056525591760873795, "kl": 0.0020536035299301147, "learning_rate": 2.602e-06, "loss": 0.0001, "num_tokens": 655877.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 40.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 2.9637739658355713, "kl": 0.048197727650403976, "learning_rate": 2.6016666666666666e-06, "loss": 0.1346, "num_tokens": 656245.0, "reward": 2.875, "reward_std": 4.346933841705322, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 4.346933841705322, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 40.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013585948618128896, "kl": 0.0011265341890975833, "learning_rate": 2.6013333333333334e-06, "loss": 0.0001, "num_tokens": 656525.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 40.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.056193895637989044, "kl": 0.008010640507563949, "learning_rate": 2.601e-06, "loss": 0.0004, "num_tokens": 656798.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.02086481638252735, "kl": 0.0006685405969619751, "learning_rate": 2.600666666666667e-06, "loss": 0.0, "num_tokens": 657010.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 40.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01918591372668743, "kl": 0.0019661039113998413, "learning_rate": 2.6003333333333333e-06, "loss": 0.0001, "num_tokens": 657270.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 40.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07656833529472351, "kl": 0.002072945237159729, "learning_rate": 2.6e-06, "loss": 0.0001, "num_tokens": 657480.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 40.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.556825876235962, "kl": 0.07838684506714344, "learning_rate": 2.5996666666666665e-06, "loss": 0.1594, "num_tokens": 657817.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 40.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05457224324345589, "kl": 0.007320788223296404, "learning_rate": 2.5993333333333337e-06, "loss": 0.0004, "num_tokens": 658089.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 40.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.10989849269390106, "kl": 0.007729566190391779, "learning_rate": 2.599e-06, "loss": 0.0004, "num_tokens": 658415.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 40.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.11126889288425446, "kl": 0.020552767906337976, "learning_rate": 2.598666666666667e-06, "loss": 0.001, "num_tokens": 658712.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 40.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.00414922134950757, "kl": 0.016017152927815914, "learning_rate": 2.5983333333333336e-06, "loss": 0.0008, "num_tokens": 658972.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 40.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.10008968412876129, "kl": 0.005681299197021872, "learning_rate": 2.598e-06, "loss": 0.0002, "num_tokens": 659248.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 40.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.26465556025505066, "kl": 0.008169974316842854, "learning_rate": 2.5976666666666667e-06, "loss": 0.0004, "num_tokens": 659504.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 40.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.017296859994530678, "kl": 0.0007104054093360901, "learning_rate": 2.5973333333333335e-06, "loss": 0.0, "num_tokens": 659764.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 40.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.03946181386709213, "kl": 0.0026910784072242677, "learning_rate": 2.5970000000000003e-06, "loss": 0.0001, "num_tokens": 660076.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.013506383635103703, "kl": 0.0015528385993093252, "learning_rate": 2.5966666666666667e-06, "loss": 0.0001, "num_tokens": 660360.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.052044857293367386, "kl": 0.008261523442342877, "learning_rate": 2.5963333333333334e-06, "loss": 0.0004, "num_tokens": 660660.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 40.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.16538044810295105, "kl": 0.02766789309680462, "learning_rate": 2.596e-06, "loss": 0.0014, "num_tokens": 660954.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 41.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.031174039468169212, "kl": 0.006419439101591706, "learning_rate": 2.5956666666666666e-06, "loss": 0.0003, "num_tokens": 661255.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009693685919046402, "kl": 0.002014942467212677, "learning_rate": 2.5953333333333334e-06, "loss": 0.0001, "num_tokens": 661491.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 41.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03078620508313179, "kl": 0.001665353775024414, "learning_rate": 2.595e-06, "loss": 0.0001, "num_tokens": 661703.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 41.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.008324520662426949, "kl": 0.0007139469380490482, "learning_rate": 2.594666666666667e-06, "loss": 0.0, "num_tokens": 661975.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 41.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.1106836125254631, "kl": 0.016338031506165862, "learning_rate": 2.5943333333333333e-06, "loss": 0.0009, "num_tokens": 662299.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 41.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.22612765431404114, "kl": 0.026517196791246533, "learning_rate": 2.594e-06, "loss": 0.0014, "num_tokens": 662640.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.06424561142921448, "kl": 0.0029029519064351916, "learning_rate": 2.5936666666666664e-06, "loss": 0.0002, "num_tokens": 662888.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.3038100004196167, "kl": 0.07607274036854506, "learning_rate": 2.5933333333333336e-06, "loss": 0.0033, "num_tokens": 663185.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 41.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.02874799631536007, "kl": 0.09451911970973015, "learning_rate": 2.593e-06, "loss": 0.0047, "num_tokens": 663549.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 41.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.02398524433374405, "kl": 0.0052295564673841, "learning_rate": 2.5926666666666668e-06, "loss": 0.0003, "num_tokens": 663819.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.18163734674453735, "kl": 0.047916144132614136, "learning_rate": 2.5923333333333336e-06, "loss": 0.0024, "num_tokens": 664088.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 41.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.3485828936100006, "kl": 0.042738866060972214, "learning_rate": 2.592e-06, "loss": 0.0022, "num_tokens": 664384.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 41.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 2.4673593044281006, "kl": 0.13859406113624573, "learning_rate": 2.5916666666666667e-06, "loss": 0.0072, "num_tokens": 664656.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 41.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06750496476888657, "kl": 0.007245576241984963, "learning_rate": 2.5913333333333335e-06, "loss": 0.0004, "num_tokens": 664961.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 41.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07238642871379852, "kl": 0.022575938142836094, "learning_rate": 2.5910000000000003e-06, "loss": 0.0011, "num_tokens": 665289.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 41.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.006112195551395416, "kl": 0.0010635495418682694, "learning_rate": 2.5906666666666666e-06, "loss": 0.0001, "num_tokens": 665549.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 41.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.1886634826660156, "kl": 0.01873736083507538, "learning_rate": 2.5903333333333334e-06, "loss": -0.0017, "num_tokens": 665845.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 41.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.028272408992052078, "kl": 0.0009019679855555296, "learning_rate": 2.5899999999999998e-06, "loss": 0.0, "num_tokens": 666079.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 41.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.06380181014537811, "kl": 0.0018694892642088234, "learning_rate": 2.5896666666666665e-06, "loss": 0.0001, "num_tokens": 666335.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 41.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.2883283793926239, "kl": 0.06977497786283493, "learning_rate": 2.5893333333333338e-06, "loss": 0.0033, "num_tokens": 666649.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 41.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.06622041761875153, "kl": 0.002407582842351985, "learning_rate": 2.589e-06, "loss": 0.0001, "num_tokens": 666921.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 41.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 4.309058666229248, "kl": 0.09977884218096733, "learning_rate": 2.588666666666667e-06, "loss": 0.0659, "num_tokens": 667261.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 41.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.4508535861968994, "kl": 0.032848202623426914, "learning_rate": 2.5883333333333333e-06, "loss": 0.2215, "num_tokens": 667715.0, "reward": 2.174999952316284, "reward_std": 1.649999976158142, "rewards/reward_combined/mean": 2.174999952316284, "rewards/reward_combined/std": 1.649999976158142, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 41.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 3.3594186305999756, "kl": 0.10897567868232727, "learning_rate": 2.588e-06, "loss": 0.0362, "num_tokens": 668065.0, "reward": 4.5, "reward_std": 2.0, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.0, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 41.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.013281864114105701, "kl": 0.0006109159730840474, "learning_rate": 2.587666666666667e-06, "loss": 0.0, "num_tokens": 668382.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 41.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12697184085845947, "kl": 0.005401700735092163, "learning_rate": 2.5873333333333336e-06, "loss": 0.0003, "num_tokens": 668626.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 41.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 7.261502742767334, "kl": 0.06200794130563736, "learning_rate": 2.587e-06, "loss": -0.0072, "num_tokens": 668915.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 41.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.018267983570694923, "kl": 0.16118168830871582, "learning_rate": 2.5866666666666667e-06, "loss": 0.0081, "num_tokens": 669223.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 41.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.27868735790252686, "kl": 0.04014579672366381, "learning_rate": 2.5863333333333335e-06, "loss": 0.002, "num_tokens": 669550.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 41.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04165789112448692, "kl": 0.012051904574036598, "learning_rate": 2.586e-06, "loss": 0.0006, "num_tokens": 669818.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 41.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04134419187903404, "kl": 0.00279614538885653, "learning_rate": 2.5856666666666667e-06, "loss": 0.0001, "num_tokens": 670124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2244 }, { "clip_ratio/high_max": 0.004629629664123058, "clip_ratio/high_mean": 0.004629629664123058, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004629629664123058, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 41.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 2.3592734336853027, "kl": 0.06017509289085865, "learning_rate": 2.5853333333333335e-06, "loss": 0.1433, "num_tokens": 670488.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 41.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.31126633286476135, "kl": 0.022451738826930523, "learning_rate": 2.5850000000000002e-06, "loss": 0.0011, "num_tokens": 670750.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 41.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.16258597373962402, "kl": 0.057327122427523136, "learning_rate": 2.5846666666666666e-06, "loss": 0.0028, "num_tokens": 671069.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 41.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.03757637366652489, "kl": 0.00690287712495774, "learning_rate": 2.5843333333333334e-06, "loss": 0.0003, "num_tokens": 671358.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 41.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.004391350317746401, "kl": 0.015965756960213184, "learning_rate": 2.5839999999999997e-06, "loss": 0.0008, "num_tokens": 671618.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 41.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012767374282702804, "kl": 0.0011478961096145213, "learning_rate": 2.5836666666666665e-06, "loss": 0.0001, "num_tokens": 671898.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.18727311491966248, "kl": 0.0045321062207221985, "learning_rate": 2.5833333333333337e-06, "loss": 0.0002, "num_tokens": 672110.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.048203155398368835, "kl": 0.009726321091875434, "learning_rate": 2.583e-06, "loss": 0.0005, "num_tokens": 672394.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 41.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.02354181371629238, "kl": 0.001652780920267105, "learning_rate": 2.582666666666667e-06, "loss": 0.0001, "num_tokens": 672706.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 41.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.022749602794647217, "kl": 0.0013177543878555298, "learning_rate": 2.5823333333333332e-06, "loss": 0.0001, "num_tokens": 672912.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.11462816596031189, "kl": 0.011050291825085878, "learning_rate": 2.582e-06, "loss": 0.0006, "num_tokens": 673197.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 41.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.2037692070007324, "kl": 1.0258885622024536, "learning_rate": 2.581666666666667e-06, "loss": 0.0678, "num_tokens": 673502.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 41.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.5264981389045715, "kl": 0.12669705785810947, "learning_rate": 2.5813333333333336e-06, "loss": 0.0065, "num_tokens": 673858.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 41.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.08587709069252014, "kl": 0.009181471075862646, "learning_rate": 2.581e-06, "loss": 0.0005, "num_tokens": 674147.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 9.749634742736816, "kl": 0.12228460609912872, "learning_rate": 2.5806666666666667e-06, "loss": 0.1727, "num_tokens": 674433.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 41.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 5.5517964363098145, "kl": 0.4932323209941387, "learning_rate": 2.5803333333333335e-06, "loss": 0.0077, "num_tokens": 674795.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.12199553847312927, "kl": 0.01308013778179884, "learning_rate": 2.58e-06, "loss": 0.0006, "num_tokens": 675088.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.004929345101118088, "kl": 0.0030177757143974304, "learning_rate": 2.5796666666666666e-06, "loss": 0.0002, "num_tokens": 675304.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2262 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 41.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 5.304366111755371, "kl": 0.07195407338440418, "learning_rate": 2.5793333333333334e-06, "loss": 0.0025, "num_tokens": 675602.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 41.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.07109731435775757, "kl": 0.0060296617448329926, "learning_rate": 2.579e-06, "loss": 0.0003, "num_tokens": 675930.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 41.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.16094832122325897, "kl": 0.014843345154076815, "learning_rate": 2.5786666666666666e-06, "loss": 0.0009, "num_tokens": 676196.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018008019542321563, "kl": 7.145851850509644e-05, "learning_rate": 2.5783333333333334e-06, "loss": 0.0, "num_tokens": 676416.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.17287349700927734, "kl": 0.005106365540996194, "learning_rate": 2.5779999999999997e-06, "loss": 0.0002, "num_tokens": 676634.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 42.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007543537649326026, "kl": 0.0004306808114051819, "learning_rate": 2.577666666666667e-06, "loss": 0.0, "num_tokens": 676894.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 42.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 1.6143964529037476, "kl": 0.1721301469951868, "learning_rate": 2.5773333333333337e-06, "loss": 0.0082, "num_tokens": 677194.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 42.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.027700692415237427, "kl": 0.153845876455307, "learning_rate": 2.577e-06, "loss": 0.0077, "num_tokens": 677505.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 42.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01927250064909458, "kl": 0.0012126043438911438, "learning_rate": 2.576666666666667e-06, "loss": 0.0001, "num_tokens": 677713.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 42.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.005752946250140667, "kl": 6.977468729019165e-05, "learning_rate": 2.576333333333333e-06, "loss": 0.0, "num_tokens": 677925.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.07913000136613846, "kl": 0.008182714227586985, "learning_rate": 2.576e-06, "loss": 0.0004, "num_tokens": 678199.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 42.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 3.6019484996795654, "kl": 0.11625132523477077, "learning_rate": 2.5756666666666668e-06, "loss": 0.1041, "num_tokens": 678535.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 83.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 83.5, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 42.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 3.0931789875030518, "kl": 0.022031554020941257, "learning_rate": 2.5753333333333336e-06, "loss": 0.438, "num_tokens": 679097.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 42.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.27651286125183105, "kl": 0.051644254475831985, "learning_rate": 2.575e-06, "loss": 0.0026, "num_tokens": 679423.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 42.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.006301445886492729, "kl": 0.00047616162919439375, "learning_rate": 2.5746666666666667e-06, "loss": 0.0, "num_tokens": 679695.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 42.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.044776782393455505, "kl": 0.039941176772117615, "learning_rate": 2.5743333333333335e-06, "loss": 0.002, "num_tokens": 680099.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 42.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.366850852966309, "kl": 0.018985837697982788, "learning_rate": 2.574e-06, "loss": 0.1862, "num_tokens": 680375.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 42.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.06638304889202118, "kl": 0.002032451331615448, "learning_rate": 2.573666666666667e-06, "loss": 0.0001, "num_tokens": 680635.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 42.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.003982122987508774, "kl": 0.0008017778454814106, "learning_rate": 2.5733333333333334e-06, "loss": 0.0, "num_tokens": 680895.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 42.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.8614780902862549, "kl": 0.13447848334908485, "learning_rate": 2.573e-06, "loss": 0.0069, "num_tokens": 681219.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 42.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.7912726402282715, "kl": 0.12185978144407272, "learning_rate": 2.5726666666666665e-06, "loss": -0.0597, "num_tokens": 681527.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 42.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.3246501088142395, "kl": 0.03552582301199436, "learning_rate": 2.5723333333333333e-06, "loss": 0.0018, "num_tokens": 681863.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 42.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.5744963884353638, "kl": 0.041536884382367134, "learning_rate": 2.572e-06, "loss": 0.0022, "num_tokens": 682159.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 42.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.11229992657899857, "kl": 0.04291626671329141, "learning_rate": 2.571666666666667e-06, "loss": 0.0021, "num_tokens": 682449.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 42.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.006222181487828493, "kl": 0.015481793321669102, "learning_rate": 2.5713333333333337e-06, "loss": 0.0008, "num_tokens": 682709.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.030857743695378304, "kl": 0.0016200148966163397, "learning_rate": 2.571e-06, "loss": 0.0001, "num_tokens": 683005.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 42.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.10765678435564041, "kl": 0.0046082064509391785, "learning_rate": 2.570666666666667e-06, "loss": 0.0002, "num_tokens": 683225.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 42.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.011586115695536137, "kl": 0.0014963998110033572, "learning_rate": 2.570333333333333e-06, "loss": 0.0001, "num_tokens": 683509.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 42.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.09506388008594513, "kl": 0.009406433149706572, "learning_rate": 2.57e-06, "loss": 0.0005, "num_tokens": 683808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 42.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06232907250523567, "kl": 0.01271789101883769, "learning_rate": 2.5696666666666667e-06, "loss": 0.0006, "num_tokens": 684127.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 42.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.17157714068889618, "kl": 0.04104907996952534, "learning_rate": 2.5693333333333335e-06, "loss": 0.002, "num_tokens": 684397.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 42.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.2651209831237793, "kl": 0.034470973536372185, "learning_rate": 2.569e-06, "loss": 0.0021, "num_tokens": 684679.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 42.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.12067551165819168, "kl": 0.011610866524279118, "learning_rate": 2.5686666666666667e-06, "loss": 0.0007, "num_tokens": 685012.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 42.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.12334544211626053, "kl": 0.012893387116491795, "learning_rate": 2.5683333333333334e-06, "loss": 0.0006, "num_tokens": 685272.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 42.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06318001449108124, "kl": 0.008761949837207794, "learning_rate": 2.568e-06, "loss": 0.0004, "num_tokens": 685488.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 42.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.042493775486946106, "kl": 0.008366410853341222, "learning_rate": 2.567666666666667e-06, "loss": 0.0004, "num_tokens": 685778.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 42.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 5.58232307434082, "kl": 0.09087443561293185, "learning_rate": 2.5673333333333334e-06, "loss": 0.011, "num_tokens": 686046.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 42.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.00357165839523077, "kl": 0.00012801885895896703, "learning_rate": 2.567e-06, "loss": 0.0, "num_tokens": 686266.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 42.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.033808790147304535, "kl": 0.007193173747509718, "learning_rate": 2.5666666666666665e-06, "loss": 0.0004, "num_tokens": 686554.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.05844135582447052, "kl": 0.009986089775338769, "learning_rate": 2.5663333333333333e-06, "loss": 0.0005, "num_tokens": 686826.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 42.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.03625547140836716, "kl": 0.006588557502254844, "learning_rate": 2.566e-06, "loss": 0.0003, "num_tokens": 687169.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 42.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 4.191218852996826, "kl": 0.07684733346104622, "learning_rate": 2.565666666666667e-06, "loss": 0.0379, "num_tokens": 687499.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 42.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.030018970370292664, "kl": 0.09379787370562553, "learning_rate": 2.5653333333333336e-06, "loss": 0.0047, "num_tokens": 687863.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014705882407724857, "clip_ratio/low_min": 0.014705882407724857, "clip_ratio/region_mean": 0.014705882407724857, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 42.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 8.721210479736328, "kl": 0.02078762650489807, "learning_rate": 2.565e-06, "loss": 0.4353, "num_tokens": 688109.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 42.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.08929096907377243, "kl": 0.003990313387475908, "learning_rate": 2.564666666666667e-06, "loss": 0.0002, "num_tokens": 688365.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 42.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.015159958973526955, "kl": 0.0008501690026605502, "learning_rate": 2.564333333333333e-06, "loss": 0.0, "num_tokens": 688684.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 42.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.026792660355567932, "kl": 0.0021284203976392746, "learning_rate": 2.564e-06, "loss": 0.0001, "num_tokens": 688996.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 42.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.0151569843292236, "kl": 0.06188581883907318, "learning_rate": 2.5636666666666667e-06, "loss": -0.0628, "num_tokens": 689341.0, "reward": 6.5, "reward_std": 2.345207929611206, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.345207929611206, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03259953111410141, "kl": 0.009544974192976952, "learning_rate": 2.5633333333333335e-06, "loss": 0.0005, "num_tokens": 689609.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 42.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.057312652468681335, "kl": 0.016362751834094524, "learning_rate": 2.563e-06, "loss": 0.0008, "num_tokens": 689911.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 42.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.005066038575023413, "kl": 0.0006015742546878755, "learning_rate": 2.5626666666666666e-06, "loss": 0.0, "num_tokens": 690154.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 42.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 6.187950611114502, "kl": 0.04966283682733774, "learning_rate": 2.5623333333333334e-06, "loss": 0.0011, "num_tokens": 690490.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 42.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.00905273575335741, "kl": 0.002031169831752777, "learning_rate": 2.562e-06, "loss": 0.0001, "num_tokens": 690726.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 42.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 2.1744160652160645, "kl": 0.10203511267900467, "learning_rate": 2.561666666666667e-06, "loss": 0.0621, "num_tokens": 691074.0, "reward": 4.25, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 2.1794495582580566, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.06537610292434692, "kl": 0.005710832541808486, "learning_rate": 2.5613333333333333e-06, "loss": 0.0003, "num_tokens": 691351.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0857694149017334, "kl": 0.010313042905181646, "learning_rate": 2.561e-06, "loss": 0.0005, "num_tokens": 691633.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 42.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.25820374488830566, "kl": 0.06344599276781082, "learning_rate": 2.5606666666666665e-06, "loss": 0.0031, "num_tokens": 691979.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 42.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.061222560703754425, "kl": 0.0075517280492931604, "learning_rate": 2.5603333333333333e-06, "loss": 0.0004, "num_tokens": 692278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 42.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.756105899810791, "kl": 0.1902681589126587, "learning_rate": 2.56e-06, "loss": 0.0247, "num_tokens": 692583.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 43.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.022758089005947113, "kl": 0.0007738002750556916, "learning_rate": 2.559666666666667e-06, "loss": 0.0, "num_tokens": 692819.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 43.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.667349100112915, "kl": 0.08784319180995226, "learning_rate": 2.5593333333333336e-06, "loss": 0.0072, "num_tokens": 693125.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 43.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.2427816092967987, "kl": 0.030141443479806185, "learning_rate": 2.559e-06, "loss": 0.0016, "num_tokens": 693396.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 43.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04597758874297142, "kl": 0.0011098682880401611, "learning_rate": 2.5586666666666668e-06, "loss": 0.0001, "num_tokens": 693608.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 43.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 5.068799018859863, "kl": 0.030716415494680405, "learning_rate": 2.558333333333333e-06, "loss": 0.0109, "num_tokens": 693915.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 43.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.02640840783715248, "kl": 0.0005000904202461243, "learning_rate": 2.5580000000000003e-06, "loss": 0.0, "num_tokens": 694123.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 43.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 5.983851432800293, "kl": 0.016102399677038193, "learning_rate": 2.5576666666666667e-06, "loss": 0.1011, "num_tokens": 694465.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 43.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.007450214121490717, "kl": 0.00421206234022975, "learning_rate": 2.5573333333333335e-06, "loss": 0.0002, "num_tokens": 694733.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 43.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.13418050110340118, "kl": 0.003793664276599884, "learning_rate": 2.5570000000000003e-06, "loss": 0.0002, "num_tokens": 694977.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 43.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 7.633325576782227, "kl": 0.08090226771309972, "learning_rate": 2.5566666666666666e-06, "loss": 0.002, "num_tokens": 695271.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2331 }, { "clip_ratio/high_max": 0.021739130839705467, "clip_ratio/high_mean": 0.021739130839705467, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021739130839705467, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 43.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 7.948868751525879, "kl": 0.02674010396003723, "learning_rate": 2.5563333333333334e-06, "loss": 0.1643, "num_tokens": 695555.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 43.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02924892120063305, "kl": 0.04285065270960331, "learning_rate": 2.556e-06, "loss": 0.0021, "num_tokens": 695959.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.4958176612854, "kl": 0.26358838722808287, "learning_rate": 2.555666666666667e-06, "loss": 0.0683, "num_tokens": 696246.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 43.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10956063121557236, "kl": 0.013329327572137117, "learning_rate": 2.5553333333333333e-06, "loss": 0.0007, "num_tokens": 696506.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 43.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.6764758229255676, "kl": 0.07917577400803566, "learning_rate": 2.555e-06, "loss": 0.0042, "num_tokens": 696872.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.057281970977783, "kl": 0.028899877332150936, "learning_rate": 2.5546666666666665e-06, "loss": 0.0031, "num_tokens": 697144.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 43.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02428019791841507, "kl": 0.09250488132238388, "learning_rate": 2.5543333333333332e-06, "loss": 0.0046, "num_tokens": 697510.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 43.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0939767062664032, "kl": 0.02160784974694252, "learning_rate": 2.554e-06, "loss": 0.0011, "num_tokens": 697826.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 43.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.06708146631717682, "kl": 0.005683839903213084, "learning_rate": 2.553666666666667e-06, "loss": 0.0003, "num_tokens": 698103.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 43.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.01883876323699951, "kl": 0.00022596716735279188, "learning_rate": 2.5533333333333336e-06, "loss": 0.0, "num_tokens": 698359.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 43.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.05525289848446846, "kl": 0.003440507600316778, "learning_rate": 2.553e-06, "loss": 0.0002, "num_tokens": 698623.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 43.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023331388365477324, "kl": 0.0014758408069610596, "learning_rate": 2.5526666666666667e-06, "loss": 0.0001, "num_tokens": 698935.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.09795980155467987, "kl": 0.02744582900777459, "learning_rate": 2.552333333333333e-06, "loss": 0.0013, "num_tokens": 699210.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 43.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.006187402177602053, "kl": 0.01566222310066223, "learning_rate": 2.5520000000000003e-06, "loss": 0.0008, "num_tokens": 699470.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 43.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.402616024017334, "kl": 0.03923051059246063, "learning_rate": 2.5516666666666667e-06, "loss": 0.0159, "num_tokens": 699850.0, "reward": 3.875, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 4.190763473510742, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 43.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 6.186767578125, "kl": 0.08794543892145157, "learning_rate": 2.5513333333333334e-06, "loss": 0.2411, "num_tokens": 700073.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 43.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.2174459546804428, "kl": 0.02345143910497427, "learning_rate": 2.5510000000000002e-06, "loss": 0.0012, "num_tokens": 700365.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 43.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.06970176845788956, "kl": 0.0035487039713189006, "learning_rate": 2.5506666666666666e-06, "loss": 0.0002, "num_tokens": 700632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 43.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.002296097343787551, "kl": 0.0004628002643585205, "learning_rate": 2.5503333333333334e-06, "loss": 0.0, "num_tokens": 700892.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 43.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 1.1953400373458862, "kl": 0.09098661225289106, "learning_rate": 2.55e-06, "loss": 0.0038, "num_tokens": 701166.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 43.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.3264387845993042, "kl": 0.030001981183886528, "learning_rate": 2.549666666666667e-06, "loss": 0.001, "num_tokens": 701420.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 43.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.20210789144039154, "kl": 0.053033437579870224, "learning_rate": 2.5493333333333333e-06, "loss": 0.0026, "num_tokens": 701774.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 43.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.4588561058044434, "kl": 0.08353149518370628, "learning_rate": 2.549e-06, "loss": -0.0189, "num_tokens": 702108.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 43.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.009912577457726002, "kl": 0.00048766733380034566, "learning_rate": 2.5486666666666664e-06, "loss": 0.0, "num_tokens": 702428.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 43.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.09448502212762833, "kl": 0.007809346076101065, "learning_rate": 2.5483333333333332e-06, "loss": 0.0004, "num_tokens": 702662.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 43.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.01845453679561615, "kl": 0.2656458467245102, "learning_rate": 2.5480000000000004e-06, "loss": 0.0133, "num_tokens": 702966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 43.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 9.126014447247144e-06, "kl": 2.4139881134033203e-06, "learning_rate": 2.5476666666666668e-06, "loss": 0.0, "num_tokens": 703186.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.015084311366081238, "kl": 0.0018152159755118191, "learning_rate": 2.5473333333333336e-06, "loss": 0.0001, "num_tokens": 703470.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 43.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.010754057206213474, "kl": 0.0018049031496047974, "learning_rate": 2.547e-06, "loss": 0.0001, "num_tokens": 703706.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 43.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.30488014221191406, "kl": 0.022927945014089346, "learning_rate": 2.5466666666666667e-06, "loss": 0.0011, "num_tokens": 704040.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.1744215041399002, "kl": 0.023055229801684618, "learning_rate": 2.5463333333333335e-06, "loss": 0.001, "num_tokens": 704338.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 43.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.003176689147949, "kl": 0.08643694035708904, "learning_rate": 2.5460000000000003e-06, "loss": 0.0184, "num_tokens": 704627.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.12822288274765015, "kl": 0.009080796968191862, "learning_rate": 2.5456666666666666e-06, "loss": 0.0005, "num_tokens": 704925.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 43.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10814723372459412, "kl": 0.022527985274791718, "learning_rate": 2.5453333333333334e-06, "loss": 0.0011, "num_tokens": 705261.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 43.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.10927625000476837, "kl": 0.002664215862751007, "learning_rate": 2.545e-06, "loss": 0.0001, "num_tokens": 705474.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.1003035306930542, "kl": 0.007294411770999432, "learning_rate": 2.5446666666666666e-06, "loss": 0.0004, "num_tokens": 705762.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 43.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 3.265911102294922, "kl": 0.05548745393753052, "learning_rate": 2.5443333333333333e-06, "loss": 0.0515, "num_tokens": 706100.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 43.870370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 4.68215274810791, "kl": 0.057888234965503216, "learning_rate": 2.544e-06, "loss": 0.0431, "num_tokens": 706389.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 43.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.4367368519306183, "kl": 0.051830656826496124, "learning_rate": 2.543666666666667e-06, "loss": 0.0026, "num_tokens": 706661.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 43.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.033806316554546356, "kl": 0.005122621078044176, "learning_rate": 2.5433333333333333e-06, "loss": 0.0003, "num_tokens": 706997.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 43.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.005678814835846424, "kl": 0.0001820743127609603, "learning_rate": 2.543e-06, "loss": 0.0, "num_tokens": 707217.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 43.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03096291422843933, "kl": 0.15496885776519775, "learning_rate": 2.5426666666666664e-06, "loss": 0.0077, "num_tokens": 707527.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 43.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.006253744009882212, "kl": 0.0012376871309243143, "learning_rate": 2.542333333333333e-06, "loss": 0.0001, "num_tokens": 707839.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 43.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.815692901611328, "kl": 0.04098579101264477, "learning_rate": 2.5420000000000004e-06, "loss": -0.0823, "num_tokens": 708142.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.24482864141464233, "kl": 0.018099462613463402, "learning_rate": 2.5416666666666668e-06, "loss": 0.0009, "num_tokens": 708438.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 44.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.027715403586626053, "kl": 0.09474433213472366, "learning_rate": 2.5413333333333335e-06, "loss": 0.0047, "num_tokens": 708802.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 44.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.7770018577575684, "kl": 0.16026543080806732, "learning_rate": 2.541e-06, "loss": 0.05, "num_tokens": 709143.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.010741172358393669, "kl": 0.001871950924396515, "learning_rate": 2.5406666666666667e-06, "loss": 0.0001, "num_tokens": 709379.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 44.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 2.987513780593872, "kl": 0.0055997485760599375, "learning_rate": 2.5403333333333335e-06, "loss": 0.1528, "num_tokens": 709732.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 44.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.13215208053588867, "kl": 0.005676170578226447, "learning_rate": 2.5400000000000002e-06, "loss": 0.0003, "num_tokens": 709965.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 44.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.1867983490228653, "kl": 0.03015311900526285, "learning_rate": 2.5396666666666666e-06, "loss": 0.0015, "num_tokens": 710269.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 44.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.26857686042785645, "kl": 0.02790098451077938, "learning_rate": 2.5393333333333334e-06, "loss": 0.0014, "num_tokens": 710541.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 44.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.031219320371747017, "kl": 0.007754998980090022, "learning_rate": 2.539e-06, "loss": 0.0004, "num_tokens": 710832.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 44.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 7.330705642700195, "kl": 0.05509260483086109, "learning_rate": 2.5386666666666665e-06, "loss": 0.0654, "num_tokens": 711154.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 44.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.027127431705594063, "kl": 0.00034999846684513614, "learning_rate": 2.5383333333333333e-06, "loss": 0.0, "num_tokens": 711410.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 44.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.7651302814483643, "kl": 0.030204717069864273, "learning_rate": 2.538e-06, "loss": -0.0754, "num_tokens": 711762.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 44.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.02672317810356617, "kl": 0.006926621310412884, "learning_rate": 2.537666666666667e-06, "loss": 0.0003, "num_tokens": 712066.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.018026020377874374, "kl": 0.00473035522736609, "learning_rate": 2.5373333333333332e-06, "loss": 0.0002, "num_tokens": 712336.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 44.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.20849008858203888, "kl": 0.0290701761841774, "learning_rate": 2.537e-06, "loss": 0.0014, "num_tokens": 712606.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.6695441007614136, "kl": 0.04420142062008381, "learning_rate": 2.5366666666666664e-06, "loss": 0.0024, "num_tokens": 712820.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 44.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.3384666442871094, "kl": 0.07243164023384452, "learning_rate": 2.5363333333333336e-06, "loss": 0.1104, "num_tokens": 713193.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 44.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.241629958152771, "kl": 0.06414328143000603, "learning_rate": 2.5360000000000004e-06, "loss": 0.0032, "num_tokens": 713519.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 44.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 4.9352216720581055, "kl": 0.03370051831007004, "learning_rate": 2.5356666666666667e-06, "loss": 0.0095, "num_tokens": 713811.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 44.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.034529946744441986, "kl": 0.1577390357851982, "learning_rate": 2.5353333333333335e-06, "loss": 0.0079, "num_tokens": 714119.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 44.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.007353818975389004, "kl": 0.010421198792755604, "learning_rate": 2.535e-06, "loss": 0.0005, "num_tokens": 714391.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 3.908125400543213, "kl": 0.05387901654466987, "learning_rate": 2.5346666666666667e-06, "loss": -0.0429, "num_tokens": 714683.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.09389282017946243, "kl": 0.0027054548263549805, "learning_rate": 2.5343333333333334e-06, "loss": 0.0001, "num_tokens": 714903.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 44.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.32091230154037476, "kl": 0.04450591653585434, "learning_rate": 2.5340000000000002e-06, "loss": 0.0022, "num_tokens": 715172.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 44.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03132433071732521, "kl": 0.003305246355012059, "learning_rate": 2.5336666666666666e-06, "loss": 0.0002, "num_tokens": 715502.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 44.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.561185359954834, "kl": 0.04197950102388859, "learning_rate": 2.5333333333333334e-06, "loss": -0.0271, "num_tokens": 715804.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 44.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.00231740134768188, "kl": 0.0011081545962952077, "learning_rate": 2.533e-06, "loss": 0.0001, "num_tokens": 716084.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 44.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033509547356516123, "kl": 0.016214151866734028, "learning_rate": 2.5326666666666665e-06, "loss": 0.0008, "num_tokens": 716344.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 44.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.031681910157203674, "kl": 0.0019194036722183228, "learning_rate": 2.5323333333333337e-06, "loss": 0.0001, "num_tokens": 716556.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 44.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03134976699948311, "kl": 0.002907024696469307, "learning_rate": 2.532e-06, "loss": 0.0001, "num_tokens": 716868.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 44.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.010179928503930569, "kl": 0.00043118372559547424, "learning_rate": 2.531666666666667e-06, "loss": 0.0, "num_tokens": 717112.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 44.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.028442008420825005, "kl": 0.0012857671245001256, "learning_rate": 2.531333333333333e-06, "loss": 0.0001, "num_tokens": 717435.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 44.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.056585948914289474, "kl": 0.01396835083141923, "learning_rate": 2.531e-06, "loss": 0.0007, "num_tokens": 717760.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 44.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.019285930320620537, "kl": 0.2654702961444855, "learning_rate": 2.5306666666666668e-06, "loss": 0.0133, "num_tokens": 718064.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 44.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 5.0360283851623535, "kl": 0.1022005844861269, "learning_rate": 2.5303333333333336e-06, "loss": -0.0013, "num_tokens": 718467.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 44.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 7.005399227142334, "kl": 0.019845019094645977, "learning_rate": 2.5300000000000003e-06, "loss": 0.1911, "num_tokens": 718758.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.031161511316895485, "kl": 0.007949382066726685, "learning_rate": 2.5296666666666667e-06, "loss": 0.0004, "num_tokens": 719026.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.68518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 5.131880760192871, "kl": 0.0300514732953161, "learning_rate": 2.5293333333333335e-06, "loss": 0.0513, "num_tokens": 719324.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11699317395687103, "kl": 0.017543671652674675, "learning_rate": 2.529e-06, "loss": 0.0009, "num_tokens": 719598.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 44.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0197089072316885, "kl": 0.00172106281388551, "learning_rate": 2.5286666666666666e-06, "loss": 0.0001, "num_tokens": 719866.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 44.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005962066818028688, "kl": 0.0010440730256959796, "learning_rate": 2.5283333333333334e-06, "loss": 0.0001, "num_tokens": 720178.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.009605719707906246, "kl": 0.003335796296596527, "learning_rate": 2.528e-06, "loss": 0.0002, "num_tokens": 720394.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 44.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.013398813083767891, "kl": 0.016179578378796577, "learning_rate": 2.5276666666666665e-06, "loss": 0.0008, "num_tokens": 720678.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 44.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.3021674156188965, "kl": 0.11446773260831833, "learning_rate": 2.5273333333333333e-06, "loss": -0.1335, "num_tokens": 721012.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 44.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 9.252238273620605, "kl": 0.021312411059625447, "learning_rate": 2.527e-06, "loss": 0.1645, "num_tokens": 721282.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.07279378920793533, "kl": 0.0004621744155883789, "learning_rate": 2.5266666666666665e-06, "loss": 0.0, "num_tokens": 721494.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 44.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.008217846974730492, "kl": 0.0005680881440639496, "learning_rate": 2.5263333333333337e-06, "loss": 0.0, "num_tokens": 721754.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 44.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.2470250427722931, "kl": 0.03967873938381672, "learning_rate": 2.526e-06, "loss": 0.002, "num_tokens": 722092.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 44.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047119478695094585, "kl": 0.0016926114330999553, "learning_rate": 2.525666666666667e-06, "loss": 0.0001, "num_tokens": 722376.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 44.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.00697978725656867, "kl": 0.0017822146764956415, "learning_rate": 2.525333333333333e-06, "loss": 0.0001, "num_tokens": 722636.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.10732169449329376, "kl": 0.0029513761401176453, "learning_rate": 2.525e-06, "loss": 0.0001, "num_tokens": 722855.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 44.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.628994941711426, "kl": 0.37450834130868316, "learning_rate": 2.5246666666666667e-06, "loss": 0.0439, "num_tokens": 723158.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 44.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.197968006134033, "kl": 0.04597779922187328, "learning_rate": 2.5243333333333335e-06, "loss": 0.0038, "num_tokens": 723491.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 73.0, "completions/mean_terminated_length": 73.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 44.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.2310808897018433, "kl": 0.019989359192550182, "learning_rate": 2.5240000000000003e-06, "loss": 0.4263, "num_tokens": 724003.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.15655246376991272, "kl": 0.02584015391767025, "learning_rate": 2.5236666666666667e-06, "loss": 0.0013, "num_tokens": 724289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 45.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0297035351395607, "kl": 0.007721581496298313, "learning_rate": 2.5233333333333335e-06, "loss": 0.0004, "num_tokens": 724557.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 45.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03807724267244339, "kl": 0.00707882852293551, "learning_rate": 2.523e-06, "loss": 0.0004, "num_tokens": 724901.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.009172759018838406, "kl": 0.003193281590938568, "learning_rate": 2.5226666666666666e-06, "loss": 0.0002, "num_tokens": 725117.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 45.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.006181388162076473, "kl": 0.0004881687054876238, "learning_rate": 2.5223333333333334e-06, "loss": 0.0, "num_tokens": 725435.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.18731491267681122, "kl": 0.029865404590964317, "learning_rate": 2.522e-06, "loss": 0.0013, "num_tokens": 725731.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 45.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.02235090732574463, "kl": 0.0006565302610397339, "learning_rate": 2.5216666666666665e-06, "loss": 0.0, "num_tokens": 725943.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 45.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 4.440337657928467, "kl": 0.07701127231121063, "learning_rate": 2.5213333333333333e-06, "loss": -0.0576, "num_tokens": 726277.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 45.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.10574109107255936, "kl": 0.02136577805504203, "learning_rate": 2.521e-06, "loss": 0.0011, "num_tokens": 726551.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.010525328107178211, "kl": 0.001914285123348236, "learning_rate": 2.520666666666667e-06, "loss": 0.0001, "num_tokens": 726787.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 45.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.23817087709903717, "kl": 0.04681949131190777, "learning_rate": 2.5203333333333337e-06, "loss": 0.0023, "num_tokens": 727090.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 45.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09021669626235962, "kl": 0.011988789541646838, "learning_rate": 2.52e-06, "loss": 0.0006, "num_tokens": 727392.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 45.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.986919403076172, "kl": 0.2177446000277996, "learning_rate": 2.519666666666667e-06, "loss": 0.0397, "num_tokens": 727653.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 45.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.1099245473742485, "kl": 0.0067427074536681175, "learning_rate": 2.519333333333333e-06, "loss": 0.0003, "num_tokens": 727949.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 45.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.005288613494485617, "kl": 0.00045069254701957107, "learning_rate": 2.519e-06, "loss": 0.0, "num_tokens": 728192.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 45.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.16958598792552948, "kl": 0.025498234666883945, "learning_rate": 2.5186666666666667e-06, "loss": 0.0011, "num_tokens": 728457.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 45.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.012915810570120811, "kl": 0.005473255878314376, "learning_rate": 2.5183333333333335e-06, "loss": 0.0003, "num_tokens": 728725.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 45.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 2.935128927230835, "kl": 0.06246868520975113, "learning_rate": 2.5180000000000003e-06, "loss": 0.0636, "num_tokens": 729064.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 45.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 5.11857795715332, "kl": 0.08278805017471313, "learning_rate": 2.5176666666666666e-06, "loss": 0.0594, "num_tokens": 729402.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.07405371963977814, "kl": 0.0010067522525787354, "learning_rate": 2.5173333333333334e-06, "loss": 0.0001, "num_tokens": 729614.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 45.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034407759085297585, "kl": 0.016221345402300358, "learning_rate": 2.5169999999999998e-06, "loss": 0.0008, "num_tokens": 729874.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 45.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.014748414047062397, "kl": 0.0010713711380958557, "learning_rate": 2.516666666666667e-06, "loss": 0.0001, "num_tokens": 730198.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 45.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 4.9868483543396, "kl": 0.040062980726361275, "learning_rate": 2.5163333333333334e-06, "loss": -0.0005, "num_tokens": 730490.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 45.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.28041645884513855, "kl": 0.031283190473914146, "learning_rate": 2.516e-06, "loss": 0.0016, "num_tokens": 730793.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 45.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.12332356721162796, "kl": 0.1614207997918129, "learning_rate": 2.515666666666667e-06, "loss": 0.0081, "num_tokens": 731102.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 45.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.182009220123291, "kl": 0.07591994479298592, "learning_rate": 2.5153333333333333e-06, "loss": 0.0185, "num_tokens": 731437.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.010616044513881207, "kl": 0.0021869930205866694, "learning_rate": 2.515e-06, "loss": 0.0001, "num_tokens": 731721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 45.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0117707634344697, "kl": 0.0005625975900329649, "learning_rate": 2.514666666666667e-06, "loss": 0.0, "num_tokens": 731956.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1691562533378601, "kl": 0.003987833857536316, "learning_rate": 2.5143333333333336e-06, "loss": 0.0002, "num_tokens": 732168.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 45.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0167376808822155, "kl": 0.2660384327173233, "learning_rate": 2.514e-06, "loss": 0.0133, "num_tokens": 732472.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 45.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.33966556191444397, "kl": 0.027728529879823327, "learning_rate": 2.5136666666666668e-06, "loss": 0.0014, "num_tokens": 732737.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 45.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.007064864505082369, "kl": 0.0018771738396026194, "learning_rate": 2.513333333333333e-06, "loss": 0.0001, "num_tokens": 732997.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 45.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 1.0757845640182495, "kl": 0.04385972023010254, "learning_rate": 2.513e-06, "loss": 0.0002, "num_tokens": 733309.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.13577602803707123, "kl": 0.014645958319306374, "learning_rate": 2.5126666666666667e-06, "loss": 0.0007, "num_tokens": 733587.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 45.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038021355867385864, "kl": 0.0007932271109893918, "learning_rate": 2.5123333333333335e-06, "loss": 0.0, "num_tokens": 733867.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 45.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0465419702231884, "kl": 0.005470138741657138, "learning_rate": 2.5120000000000003e-06, "loss": 0.0003, "num_tokens": 734157.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 45.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 2.7946746349334717, "kl": 0.07041575387120247, "learning_rate": 2.5116666666666666e-06, "loss": 0.1658, "num_tokens": 734512.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 1.479394793510437, "kl": 0.17642395664006472, "learning_rate": 2.5113333333333334e-06, "loss": 0.0094, "num_tokens": 734798.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 45.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.376974582672119, "kl": 0.007895383670984302, "learning_rate": 2.5109999999999998e-06, "loss": 0.0378, "num_tokens": 735072.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 45.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 6.2208967208862305, "kl": 0.1544065736234188, "learning_rate": 2.510666666666667e-06, "loss": 0.0481, "num_tokens": 735413.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 45.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.045347291976213455, "kl": 0.01228410005569458, "learning_rate": 2.5103333333333333e-06, "loss": 0.0006, "num_tokens": 735825.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.142083078622818, "kl": 0.025195241440087557, "learning_rate": 2.51e-06, "loss": 0.0015, "num_tokens": 736111.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 45.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.03137398883700371, "kl": 0.09449802339076996, "learning_rate": 2.509666666666667e-06, "loss": 0.0047, "num_tokens": 736475.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 45.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033673427533358335, "kl": 9.293853509007022e-05, "learning_rate": 2.5093333333333333e-06, "loss": 0.0, "num_tokens": 736731.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.03953562676906586, "kl": 0.00045480579137802124, "learning_rate": 2.509e-06, "loss": 0.0, "num_tokens": 736951.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 45.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.09735623747110367, "kl": 0.004153555637458339, "learning_rate": 2.508666666666667e-06, "loss": 0.0002, "num_tokens": 737173.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.15920746326446533, "kl": 0.031207595951855183, "learning_rate": 2.5083333333333336e-06, "loss": 0.0016, "num_tokens": 737442.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 45.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.5551946759223938, "kl": 0.03656729869544506, "learning_rate": 2.508e-06, "loss": 0.0018, "num_tokens": 737738.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 45.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.03540947660803795, "kl": 0.0070354241179302335, "learning_rate": 2.5076666666666667e-06, "loss": 0.0004, "num_tokens": 738061.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 45.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.33373937010765076, "kl": 0.018233067821711302, "learning_rate": 2.507333333333333e-06, "loss": 0.0008, "num_tokens": 738325.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 45.925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 3.7227039337158203, "kl": 0.018040990456938744, "learning_rate": 2.507e-06, "loss": 0.2375, "num_tokens": 738741.0, "reward": 7.375, "reward_std": 0.25, "rewards/reward_combined/mean": 7.375, "rewards/reward_combined/std": 0.25, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 45.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11966013163328171, "kl": 0.021700285375118256, "learning_rate": 2.506666666666667e-06, "loss": 0.0011, "num_tokens": 739065.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 45.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.2690456807613373, "kl": 0.02911150682484731, "learning_rate": 2.5063333333333334e-06, "loss": 0.0015, "num_tokens": 739377.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 45.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.006945215165615082, "kl": 0.0007221311389002949, "learning_rate": 2.5060000000000002e-06, "loss": 0.0, "num_tokens": 739637.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 46.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0390542708337307, "kl": 0.039667438715696335, "learning_rate": 2.5056666666666666e-06, "loss": 0.0019, "num_tokens": 740049.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 46.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011404343240428716, "kl": 3.598630428314209e-06, "learning_rate": 2.5053333333333334e-06, "loss": 0.0, "num_tokens": 740269.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 46.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009822768159210682, "kl": 0.004389554378576577, "learning_rate": 2.505e-06, "loss": 0.0002, "num_tokens": 740539.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 46.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.015896078199148178, "kl": 0.2662335932254791, "learning_rate": 2.504666666666667e-06, "loss": 0.0133, "num_tokens": 740843.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 46.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.05427286773920059, "kl": 0.006968255620449781, "learning_rate": 2.5043333333333333e-06, "loss": 0.0003, "num_tokens": 741146.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 46.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.003982073627412319, "kl": 0.0006556894222740084, "learning_rate": 2.504e-06, "loss": 0.0, "num_tokens": 741426.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 46.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0342152938246727, "kl": 0.00826589995995164, "learning_rate": 2.503666666666667e-06, "loss": 0.0004, "num_tokens": 741714.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 46.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.08570891618728638, "kl": 0.0028475001454353333, "learning_rate": 2.5033333333333332e-06, "loss": 0.0001, "num_tokens": 741922.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 46.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.06338085234165192, "kl": 0.017430400475859642, "learning_rate": 2.503e-06, "loss": 0.0008, "num_tokens": 742243.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 46.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035878820344805717, "kl": 0.016196363605558872, "learning_rate": 2.502666666666667e-06, "loss": 0.0008, "num_tokens": 742503.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 46.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 5.096258640289307, "kl": 0.12530342489480972, "learning_rate": 2.5023333333333336e-06, "loss": 0.1053, "num_tokens": 742829.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 46.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.535659074783325, "kl": 0.02935294434428215, "learning_rate": 2.502e-06, "loss": 0.119, "num_tokens": 743182.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 46.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.26846736669540405, "kl": 0.03729828912764788, "learning_rate": 2.5016666666666667e-06, "loss": 0.0019, "num_tokens": 743456.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 46.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.3158177435398102, "kl": 0.026865395164350048, "learning_rate": 2.501333333333333e-06, "loss": 0.0014, "num_tokens": 743717.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 46.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.376829147338867, "kl": 0.0505690579302609, "learning_rate": 2.501e-06, "loss": -0.0717, "num_tokens": 744063.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 46.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.32744091749191284, "kl": 0.018593482207506895, "learning_rate": 2.500666666666667e-06, "loss": 0.0009, "num_tokens": 744305.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 46.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.012301038019359112, "kl": 0.0015533939003944397, "learning_rate": 2.5003333333333334e-06, "loss": 0.0001, "num_tokens": 744541.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 46.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.16265687346458435, "kl": 0.023410575464367867, "learning_rate": 2.5e-06, "loss": 0.0012, "num_tokens": 744813.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 46.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.1362433135509491, "kl": 0.018554782029241323, "learning_rate": 2.4996666666666666e-06, "loss": 0.001, "num_tokens": 745086.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 46.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 4.151635646820068, "kl": 0.05165325850248337, "learning_rate": 2.4993333333333333e-06, "loss": -0.0348, "num_tokens": 745385.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 46.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 1.7361879348754883, "kl": 0.25426632445305586, "learning_rate": 2.499e-06, "loss": 0.015, "num_tokens": 745697.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 46.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.035286128520965576, "kl": 0.001969415054190904, "learning_rate": 2.498666666666667e-06, "loss": 0.0001, "num_tokens": 745963.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 46.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.12127512693405151, "kl": 0.007447681622579694, "learning_rate": 2.4983333333333333e-06, "loss": 0.0004, "num_tokens": 746259.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 46.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 4.530540466308594, "kl": 0.15745895355939865, "learning_rate": 2.498e-06, "loss": -0.0343, "num_tokens": 746631.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.75, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 46.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.138972282409668, "kl": 0.07470146007835865, "learning_rate": 2.497666666666667e-06, "loss": 0.0621, "num_tokens": 747026.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 46.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.7906563878059387, "kl": 0.14853118360042572, "learning_rate": 2.497333333333333e-06, "loss": 0.0077, "num_tokens": 747359.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.25, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 46.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.6633254885673523, "kl": 0.08115752972662449, "learning_rate": 2.497e-06, "loss": 0.0049, "num_tokens": 747764.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 46.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.39222899079322815, "kl": 0.04715419188141823, "learning_rate": 2.4966666666666668e-06, "loss": 0.0024, "num_tokens": 748096.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 46.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01646861806511879, "kl": 0.0010175041970796883, "learning_rate": 2.4963333333333335e-06, "loss": 0.0001, "num_tokens": 748358.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 46.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.042602717876434326, "kl": 0.004759176634252071, "learning_rate": 2.496e-06, "loss": 0.0002, "num_tokens": 748670.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 46.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.3041357398033142, "kl": 0.06130489706993103, "learning_rate": 2.4956666666666667e-06, "loss": 0.0032, "num_tokens": 748997.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 46.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.007603069767355919, "kl": 8.128583431243896e-05, "learning_rate": 2.495333333333333e-06, "loss": 0.0, "num_tokens": 749209.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 46.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0253154207020998, "kl": 0.09312248229980469, "learning_rate": 2.4950000000000003e-06, "loss": 0.0047, "num_tokens": 749575.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 46.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.11499327421188354, "kl": 0.014323872746899724, "learning_rate": 2.494666666666667e-06, "loss": 0.0008, "num_tokens": 749919.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 46.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.1348366141319275, "kl": 0.005608153063803911, "learning_rate": 2.4943333333333334e-06, "loss": 0.0003, "num_tokens": 750195.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 46.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.04203183576464653, "kl": 0.035331493243575096, "learning_rate": 2.494e-06, "loss": 0.0019, "num_tokens": 750485.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 46.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.19115984439849854, "kl": 0.02633888367563486, "learning_rate": 2.4936666666666665e-06, "loss": 0.0013, "num_tokens": 750783.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 46.68518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 18.35003662109375, "kl": 0.2072160392999649, "learning_rate": 2.4933333333333333e-06, "loss": -0.2274, "num_tokens": 751000.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 46.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.001844356651417911, "kl": 0.0004361694009276107, "learning_rate": 2.493e-06, "loss": 0.0, "num_tokens": 751272.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 46.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.004649453330785036, "kl": 0.001757514022756368, "learning_rate": 2.492666666666667e-06, "loss": 0.0001, "num_tokens": 751556.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 46.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.1287909746170044, "kl": 0.016881283838301897, "learning_rate": 2.4923333333333332e-06, "loss": 0.0008, "num_tokens": 751834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 46.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.2693098187446594, "kl": 0.030407872691284865, "learning_rate": 2.492e-06, "loss": 0.0014, "num_tokens": 752143.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 46.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.1576777994632721, "kl": 0.01178092899499461, "learning_rate": 2.491666666666667e-06, "loss": 0.0006, "num_tokens": 752462.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 46.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.01044461503624916, "kl": 0.0028221234679222107, "learning_rate": 2.491333333333333e-06, "loss": 0.0001, "num_tokens": 752678.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 46.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.08084384351968765, "kl": 0.008157667936757207, "learning_rate": 2.4910000000000004e-06, "loss": 0.0005, "num_tokens": 752937.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 46.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.09120894968509674, "kl": 0.004084435146069154, "learning_rate": 2.4906666666666667e-06, "loss": 0.0002, "num_tokens": 753193.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 46.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0273737870156765, "kl": 0.0009900123986881226, "learning_rate": 2.4903333333333335e-06, "loss": 0.0, "num_tokens": 753511.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 46.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.010336839593946934, "kl": 0.009016639087349176, "learning_rate": 2.49e-06, "loss": 0.0005, "num_tokens": 753783.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 46.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.11457367241382599, "kl": 0.010811601998284459, "learning_rate": 2.4896666666666667e-06, "loss": 0.0006, "num_tokens": 754080.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 46.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.02475919760763645, "kl": 0.008598325541242957, "learning_rate": 2.4893333333333334e-06, "loss": 0.0004, "num_tokens": 754384.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 46.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0379536934196949, "kl": 0.044369883835315704, "learning_rate": 2.4890000000000002e-06, "loss": 0.0022, "num_tokens": 754788.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 46.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04465353861451149, "kl": 0.001675780862569809, "learning_rate": 2.488666666666667e-06, "loss": 0.0001, "num_tokens": 755048.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 46.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1090872511267662, "kl": 0.005680212285369635, "learning_rate": 2.4883333333333334e-06, "loss": 0.0003, "num_tokens": 755261.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 46.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.7724571228027344, "kl": 0.0728769488632679, "learning_rate": 2.488e-06, "loss": 0.0889, "num_tokens": 755613.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 47.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.3414354920387268, "kl": 0.02498954487964511, "learning_rate": 2.4876666666666665e-06, "loss": 0.0012, "num_tokens": 755848.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 47.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.039831146597862244, "kl": 0.09290435910224915, "learning_rate": 2.4873333333333333e-06, "loss": 0.0046, "num_tokens": 756212.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 47.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.031223390251398087, "kl": 0.0023416792973876, "learning_rate": 2.487e-06, "loss": 0.0001, "num_tokens": 756482.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.1879866123199463, "kl": 0.5608605849556625, "learning_rate": 2.486666666666667e-06, "loss": 0.0271, "num_tokens": 756774.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 47.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.007637239061295986, "kl": 0.0013469458208419383, "learning_rate": 2.4863333333333332e-06, "loss": 0.0001, "num_tokens": 756994.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 47.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 16.00135612487793, "kl": 0.01819664239883423, "learning_rate": 2.486e-06, "loss": 0.0077, "num_tokens": 757202.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 47.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.1345706433057785, "kl": 0.007921293145045638, "learning_rate": 2.4856666666666668e-06, "loss": 0.0005, "num_tokens": 757548.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 47.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.017033517360687256, "kl": 0.0006393283547367901, "learning_rate": 2.485333333333333e-06, "loss": 0.0, "num_tokens": 757816.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 47.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.010561208240687847, "kl": 0.0025972798466682434, "learning_rate": 2.4850000000000003e-06, "loss": 0.0001, "num_tokens": 758032.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 47.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 4.55400276184082, "kl": 0.024325484409928322, "learning_rate": 2.4846666666666667e-06, "loss": 0.052, "num_tokens": 758360.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 47.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022138182539492846, "kl": 0.0003915958950528875, "learning_rate": 2.4843333333333335e-06, "loss": 0.0, "num_tokens": 758672.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 47.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.5684993267059326, "kl": 0.11294364742934704, "learning_rate": 2.484e-06, "loss": 0.006, "num_tokens": 758982.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 47.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.007175232283771038, "kl": 0.0003366991877555847, "learning_rate": 2.4836666666666666e-06, "loss": 0.0, "num_tokens": 759226.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2550 }, { "clip_ratio/high_max": 0.009999999776482582, "clip_ratio/high_mean": 0.009999999776482582, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009999999776482582, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 47.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.513923406600952, "kl": 0.05781305208802223, "learning_rate": 2.4833333333333334e-06, "loss": 0.0348, "num_tokens": 759558.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 47.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 2.342684030532837, "kl": 0.39348478708416224, "learning_rate": 2.483e-06, "loss": 0.0149, "num_tokens": 759923.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 47.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.04920942336320877, "kl": 0.15138857811689377, "learning_rate": 2.482666666666667e-06, "loss": 0.0076, "num_tokens": 760233.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2553 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.01666666753590107, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01666666753590107, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 47.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.717015266418457, "kl": 0.0757996179163456, "learning_rate": 2.4823333333333333e-06, "loss": 0.016, "num_tokens": 760554.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 47.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.03699135035276413, "kl": 0.00045265257358551025, "learning_rate": 2.482e-06, "loss": 0.0, "num_tokens": 760774.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 47.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 2.284163236618042, "kl": 0.04800225980579853, "learning_rate": 2.4816666666666665e-06, "loss": -0.0975, "num_tokens": 761119.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 47.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.06733513623476028, "kl": 0.0052927323267795146, "learning_rate": 2.4813333333333333e-06, "loss": 0.0003, "num_tokens": 761447.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 47.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0070976922288537025, "kl": 0.0002507045865058899, "learning_rate": 2.481e-06, "loss": 0.0, "num_tokens": 761659.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 47.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.1926703006029129, "kl": 0.021734744776040316, "learning_rate": 2.480666666666667e-06, "loss": 0.0011, "num_tokens": 761953.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.1924740970134735, "kl": 0.027753588743507862, "learning_rate": 2.480333333333333e-06, "loss": 0.0015, "num_tokens": 762235.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 47.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.016208812594413757, "kl": 0.002021443098783493, "learning_rate": 2.48e-06, "loss": 0.0001, "num_tokens": 762547.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 47.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05852117761969566, "kl": 0.010034375358372927, "learning_rate": 2.4796666666666668e-06, "loss": 0.0005, "num_tokens": 762868.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 47.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.275321006774902, "kl": 0.20327900350093842, "learning_rate": 2.4793333333333335e-06, "loss": 0.0253, "num_tokens": 763173.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 47.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.024304278194904327, "kl": 0.0016468060784973204, "learning_rate": 2.4790000000000003e-06, "loss": 0.0001, "num_tokens": 763453.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0864797979593277, "kl": 0.020508273504674435, "learning_rate": 2.4786666666666667e-06, "loss": 0.001, "num_tokens": 763726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 47.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.4386816620826721, "kl": 0.06215875409543514, "learning_rate": 2.4783333333333335e-06, "loss": 0.0031, "num_tokens": 764131.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 47.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.4438295364379883, "kl": 0.47084836941212416, "learning_rate": 2.478e-06, "loss": 0.036, "num_tokens": 764466.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 47.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.007470057345926762, "kl": 0.0007480502245016396, "learning_rate": 2.4776666666666666e-06, "loss": 0.0, "num_tokens": 764726.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 47.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.03928351774811745, "kl": 0.007036558818072081, "learning_rate": 2.4773333333333334e-06, "loss": 0.0004, "num_tokens": 764998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 47.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.007212792988866568, "kl": 5.517899990081787e-05, "learning_rate": 2.477e-06, "loss": 0.0, "num_tokens": 765210.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 47.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 1.9965059757232666, "kl": 0.035281239077448845, "learning_rate": 2.476666666666667e-06, "loss": 0.0476, "num_tokens": 765576.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 47.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.00748828612267971, "kl": 0.015323393978178501, "learning_rate": 2.4763333333333333e-06, "loss": 0.0008, "num_tokens": 765836.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 47.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 2.799642562866211, "kl": 0.06395893171429634, "learning_rate": 2.476e-06, "loss": -0.0077, "num_tokens": 766177.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 47.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.20798420906066895, "kl": 0.020984972827136517, "learning_rate": 2.4756666666666665e-06, "loss": 0.0011, "num_tokens": 766467.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 47.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.013636157847940922, "kl": 0.0006756596267223358, "learning_rate": 2.4753333333333332e-06, "loss": 0.0, "num_tokens": 766727.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 47.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.8445509672164917, "kl": 0.09059044159948826, "learning_rate": 2.475e-06, "loss": 0.0049, "num_tokens": 767003.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 47.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017572080250829458, "kl": 0.0007877677562646568, "learning_rate": 2.474666666666667e-06, "loss": 0.0, "num_tokens": 767263.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 47.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.777828216552734, "kl": 0.015769362449645996, "learning_rate": 2.4743333333333336e-06, "loss": -0.0103, "num_tokens": 767589.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 47.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.050758421421051025, "kl": 0.006462006596848369, "learning_rate": 2.474e-06, "loss": 0.0003, "num_tokens": 767893.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.011461479589343071, "kl": 0.008317825384438038, "learning_rate": 2.4736666666666667e-06, "loss": 0.0004, "num_tokens": 768165.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 47.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08216534554958344, "kl": 0.0032526047143619508, "learning_rate": 2.4733333333333335e-06, "loss": 0.0002, "num_tokens": 768421.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 47.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.010482510551810265, "kl": 0.006229990627616644, "learning_rate": 2.4730000000000003e-06, "loss": 0.0003, "num_tokens": 768710.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.01895684376358986, "kl": 0.0023190357023850083, "learning_rate": 2.4726666666666667e-06, "loss": 0.0001, "num_tokens": 768994.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 47.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.04171139374375343, "kl": 0.002461825031787157, "learning_rate": 2.4723333333333334e-06, "loss": 0.0001, "num_tokens": 769230.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 47.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.007956587709486485, "kl": 0.0022862255573272705, "learning_rate": 2.472e-06, "loss": 0.0001, "num_tokens": 769466.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 47.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.120576873421669, "kl": 0.02623407356441021, "learning_rate": 2.4716666666666666e-06, "loss": 0.0013, "num_tokens": 769798.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.1029219850897789, "kl": 0.015235194936394691, "learning_rate": 2.4713333333333334e-06, "loss": 0.0006, "num_tokens": 770090.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 47.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.04186006635427475, "kl": 0.0017276121652685106, "learning_rate": 2.471e-06, "loss": 0.0001, "num_tokens": 770386.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 47.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05727760121226311, "kl": 0.014936825260519981, "learning_rate": 2.470666666666667e-06, "loss": 0.0007, "num_tokens": 770646.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 47.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.036273978650569916, "kl": 0.007272610906511545, "learning_rate": 2.4703333333333333e-06, "loss": 0.0004, "num_tokens": 770916.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 47.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.12503013014793396, "kl": 0.021614138036966324, "learning_rate": 2.47e-06, "loss": 0.0011, "num_tokens": 771216.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 48.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010718362173065543, "kl": 0.0012030623038299382, "learning_rate": 2.4696666666666664e-06, "loss": 0.0001, "num_tokens": 771496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 48.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038712776731699705, "kl": 0.0004808281664736569, "learning_rate": 2.4693333333333336e-06, "loss": 0.0, "num_tokens": 771768.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 48.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 8.828770637512207, "kl": 0.10881831945152953, "learning_rate": 2.469e-06, "loss": -0.0426, "num_tokens": 772026.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 48.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.00795852392911911, "kl": 0.0023187175393104553, "learning_rate": 2.4686666666666668e-06, "loss": 0.0001, "num_tokens": 772262.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 48.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.06079699099063873, "kl": 0.010363928508013487, "learning_rate": 2.4683333333333336e-06, "loss": 0.0005, "num_tokens": 772566.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.012865503318607807, "kl": 0.007851775735616684, "learning_rate": 2.468e-06, "loss": 0.0004, "num_tokens": 772838.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.24803395569324493, "kl": 0.035253395326435566, "learning_rate": 2.4676666666666667e-06, "loss": 0.0019, "num_tokens": 773119.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 48.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.03238431736826897, "kl": 0.00431145797483623, "learning_rate": 2.4673333333333335e-06, "loss": 0.0002, "num_tokens": 773390.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 48.148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 137.25222778320312, "kl": 10.027743226848543, "learning_rate": 2.4670000000000003e-06, "loss": 0.5305, "num_tokens": 773681.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 48.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 5.340005397796631, "kl": 0.0995195247232914, "learning_rate": 2.4666666666666666e-06, "loss": 0.0852, "num_tokens": 773979.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.23584656417369843, "kl": 0.04996185004711151, "learning_rate": 2.4663333333333334e-06, "loss": 0.0027, "num_tokens": 774299.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.2928851544857025, "kl": 0.026695653796195984, "learning_rate": 2.4659999999999998e-06, "loss": 0.0014, "num_tokens": 774585.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.3870152533054352, "kl": 0.05899230018258095, "learning_rate": 2.4656666666666666e-06, "loss": 0.0029, "num_tokens": 774873.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 48.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.11196646094322205, "kl": 0.020365355536341667, "learning_rate": 2.4653333333333338e-06, "loss": 0.001, "num_tokens": 775210.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 48.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.006418376229703426, "kl": 0.0004412122070789337, "learning_rate": 2.465e-06, "loss": 0.0, "num_tokens": 775454.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 48.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.012934365309774876, "kl": 0.014251023530960083, "learning_rate": 2.464666666666667e-06, "loss": 0.0007, "num_tokens": 775714.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 48.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.019527660682797432, "kl": 0.0028348437044769526, "learning_rate": 2.4643333333333333e-06, "loss": 0.0001, "num_tokens": 775994.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 48.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.006592772901058197, "kl": 0.0004989169538021088, "learning_rate": 2.464e-06, "loss": 0.0, "num_tokens": 776254.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 48.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.03467896953225136, "kl": 0.007223922293633223, "learning_rate": 2.463666666666667e-06, "loss": 0.0004, "num_tokens": 776524.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 48.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.016406308859586716, "kl": 0.0010848395177163184, "learning_rate": 2.4633333333333336e-06, "loss": 0.0001, "num_tokens": 776849.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 48.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.047961898148059845, "kl": 0.002674810995813459, "learning_rate": 2.463e-06, "loss": 0.0001, "num_tokens": 777123.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 48.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 2.209963321685791, "kl": 0.04117728769779205, "learning_rate": 2.4626666666666667e-06, "loss": -0.035, "num_tokens": 777483.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 48.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.03381167724728584, "kl": 0.019356227945536375, "learning_rate": 2.4623333333333335e-06, "loss": 0.001, "num_tokens": 777760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 48.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.05479121953248978, "kl": 0.00934884324669838, "learning_rate": 2.462e-06, "loss": 0.0005, "num_tokens": 778042.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 48.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.08005852997303009, "kl": 0.01333491737022996, "learning_rate": 2.4616666666666667e-06, "loss": 0.0007, "num_tokens": 778365.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 48.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07427375018596649, "kl": 0.0055592358112335205, "learning_rate": 2.4613333333333335e-06, "loss": 0.0003, "num_tokens": 778581.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 80.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 80.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 48.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.4323911666870117, "kl": 0.023786271922290325, "learning_rate": 2.4610000000000002e-06, "loss": 0.481, "num_tokens": 779127.0, "reward": 4.425000190734863, "reward_std": 3.9609551429748535, "rewards/reward_combined/mean": 4.425000190734863, "rewards/reward_combined/std": 3.9609553813934326, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 48.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.00801610667258501, "kl": 0.0007882237550802529, "learning_rate": 2.4606666666666666e-06, "loss": 0.0, "num_tokens": 779347.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 48.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003652064478956163, "kl": 2.3402273654937744e-05, "learning_rate": 2.4603333333333334e-06, "loss": 0.0, "num_tokens": 779567.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 48.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.588613986968994, "kl": 0.05327329598367214, "learning_rate": 2.4599999999999997e-06, "loss": 0.1191, "num_tokens": 779931.0, "reward": 3.625, "reward_std": 2.75, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 2.75, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 48.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.19643938541412354, "kl": 0.00969262095168233, "learning_rate": 2.4596666666666665e-06, "loss": 0.0005, "num_tokens": 780145.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 48.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.02397293969988823, "kl": 0.006498393137007952, "learning_rate": 2.4593333333333337e-06, "loss": 0.0003, "num_tokens": 780433.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 48.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.6208887100219727, "kl": 0.058538658544421196, "learning_rate": 2.459e-06, "loss": 0.0027, "num_tokens": 780763.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 48.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0708281397819519, "kl": 0.0025162369711324573, "learning_rate": 2.458666666666667e-06, "loss": 0.0001, "num_tokens": 781019.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 48.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.010458112694323063, "kl": 0.0020992299541831017, "learning_rate": 2.4583333333333332e-06, "loss": 0.0001, "num_tokens": 781331.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.06427004933357239, "kl": 0.003297789953649044, "learning_rate": 2.458e-06, "loss": 0.0002, "num_tokens": 781598.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 48.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 3.400606393814087, "kl": 0.09866466373205185, "learning_rate": 2.457666666666667e-06, "loss": 0.0915, "num_tokens": 781954.0, "reward": 5.5, "reward_std": 2.309401035308838, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 2.309401035308838, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 48.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.2850549817085266, "kl": 0.03583748638629913, "learning_rate": 2.4573333333333336e-06, "loss": 0.0017, "num_tokens": 782160.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 48.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.012169628404080868, "kl": 0.2668505907058716, "learning_rate": 2.457e-06, "loss": 0.0133, "num_tokens": 782464.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 48.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.007750585209578276, "kl": 0.0011209641816094518, "learning_rate": 2.4566666666666667e-06, "loss": 0.0001, "num_tokens": 782760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 48.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.010310211218893528, "kl": 0.00013162195682525635, "learning_rate": 2.4563333333333335e-06, "loss": 0.0, "num_tokens": 782972.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 48.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.9040417671203613, "kl": 0.06567835807800293, "learning_rate": 2.456e-06, "loss": -0.0099, "num_tokens": 783320.0, "reward": 5.375, "reward_std": 2.75, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.75, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 48.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.017361639067530632, "kl": 0.0015062980819493532, "learning_rate": 2.4556666666666666e-06, "loss": 0.0001, "num_tokens": 783555.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2634 }, { "clip_ratio/high_max": 0.007575757801532745, "clip_ratio/high_mean": 0.007575757801532745, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007575757801532745, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 48.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.701044082641602, "kl": 0.05855248123407364, "learning_rate": 2.4553333333333334e-06, "loss": 0.0993, "num_tokens": 783917.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 48.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.03841494768857956, "kl": 0.002091391012072563, "learning_rate": 2.4550000000000002e-06, "loss": 0.0001, "num_tokens": 784233.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 48.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.12177629768848419, "kl": 0.05128871090710163, "learning_rate": 2.4546666666666666e-06, "loss": 0.0026, "num_tokens": 784531.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 48.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.09979604929685593, "kl": 0.10168707370758057, "learning_rate": 2.4543333333333334e-06, "loss": 0.0051, "num_tokens": 784898.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 48.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.1377181112766266, "kl": 0.015214312821626663, "learning_rate": 2.4539999999999997e-06, "loss": 0.0008, "num_tokens": 785156.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 48.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.017956532537937164, "kl": 0.04019933193922043, "learning_rate": 2.453666666666667e-06, "loss": 0.002, "num_tokens": 785561.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 48.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.8203216791152954, "kl": 0.13380496203899384, "learning_rate": 2.4533333333333337e-06, "loss": 0.0056, "num_tokens": 785877.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 48.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.04856766760349274, "kl": 0.12611490115523338, "learning_rate": 2.453e-06, "loss": 0.0063, "num_tokens": 786187.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 48.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.00923575833439827, "kl": 0.00044141808757558465, "learning_rate": 2.452666666666667e-06, "loss": 0.0, "num_tokens": 786504.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 48.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02838926948606968, "kl": 0.005278926342725754, "learning_rate": 2.452333333333333e-06, "loss": 0.0003, "num_tokens": 786808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 48.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07370824366807938, "kl": 0.01963486336171627, "learning_rate": 2.452e-06, "loss": 0.001, "num_tokens": 787128.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 49.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08243677020072937, "kl": 0.006912033539265394, "learning_rate": 2.4516666666666668e-06, "loss": 0.0004, "num_tokens": 787392.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 49.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01931563951075077, "kl": 0.0012164450890850276, "learning_rate": 2.4513333333333336e-06, "loss": 0.0001, "num_tokens": 787706.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 49.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.0468934774398804, "kl": 0.06819241680204868, "learning_rate": 2.451e-06, "loss": -0.0331, "num_tokens": 788107.0, "reward": 1.75, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.4433757066726685, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.504814088344574, "kl": 0.06039337324909866, "learning_rate": 2.4506666666666667e-06, "loss": 0.003, "num_tokens": 788391.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 49.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.07930750399827957, "kl": 0.010783094447106123, "learning_rate": 2.4503333333333335e-06, "loss": 0.0006, "num_tokens": 788649.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 49.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.02539624646306038, "kl": 0.0014381707296706736, "learning_rate": 2.45e-06, "loss": 0.0001, "num_tokens": 788917.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 49.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008240375318564475, "kl": 0.001209248322993517, "learning_rate": 2.449666666666667e-06, "loss": 0.0001, "num_tokens": 789197.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 49.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.39711856842041016, "kl": 0.02422859240323305, "learning_rate": 2.4493333333333334e-06, "loss": 0.0012, "num_tokens": 789532.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.1310444474220276, "kl": 0.013972847256809473, "learning_rate": 2.449e-06, "loss": 0.0006, "num_tokens": 789824.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 49.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.00037588595296256244, "kl": 2.4981796741485596e-05, "learning_rate": 2.4486666666666665e-06, "loss": 0.0, "num_tokens": 790044.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 49.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.07478267699480057, "kl": 0.013302026316523552, "learning_rate": 2.4483333333333333e-06, "loss": 0.0007, "num_tokens": 790368.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 49.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.013030369766056538, "kl": 0.0034678855445235968, "learning_rate": 2.448e-06, "loss": 0.0002, "num_tokens": 790656.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2657 }, { "clip_ratio/high_max": 0.014285714365541935, "clip_ratio/high_mean": 0.014285714365541935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014285714365541935, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.099160194396973, "kl": 0.04462042637169361, "learning_rate": 2.447666666666667e-06, "loss": 0.0722, "num_tokens": 790963.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 49.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10959002375602722, "kl": 0.0050025584059767425, "learning_rate": 2.4473333333333337e-06, "loss": 0.0003, "num_tokens": 791196.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 49.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.008650427684187889, "kl": 0.0024437233805656433, "learning_rate": 2.447e-06, "loss": 0.0001, "num_tokens": 791412.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 49.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.02097531035542488, "kl": 0.00032559634564677253, "learning_rate": 2.446666666666667e-06, "loss": 0.0, "num_tokens": 791668.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 49.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.12095050513744354, "kl": 0.019461162388324738, "learning_rate": 2.446333333333333e-06, "loss": 0.001, "num_tokens": 791967.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 49.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.03256397321820259, "kl": 0.002429179206956178, "learning_rate": 2.446e-06, "loss": 0.0001, "num_tokens": 792263.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 49.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.036686867475509644, "kl": 0.007383710239082575, "learning_rate": 2.4456666666666667e-06, "loss": 0.0004, "num_tokens": 792556.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 49.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 3.2952873706817627, "kl": 0.5442184414714575, "learning_rate": 2.4453333333333335e-06, "loss": -0.1153, "num_tokens": 792886.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 49.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037277434021234512, "kl": 0.0004651211202144623, "learning_rate": 2.445e-06, "loss": 0.0, "num_tokens": 793146.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 49.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.008202599361538887, "kl": 0.0007944583776406944, "learning_rate": 2.4446666666666667e-06, "loss": 0.0, "num_tokens": 793366.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 49.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.034822553396224976, "kl": 0.003939538088161498, "learning_rate": 2.4443333333333334e-06, "loss": 0.0002, "num_tokens": 793700.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 49.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.08850719779729843, "kl": 0.009072621120139956, "learning_rate": 2.444e-06, "loss": 0.0005, "num_tokens": 793950.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.10264670848846436, "kl": 0.0181496012955904, "learning_rate": 2.443666666666667e-06, "loss": 0.0009, "num_tokens": 794220.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 49.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.010163417086005211, "kl": 0.0017835497856140137, "learning_rate": 2.4433333333333334e-06, "loss": 0.0001, "num_tokens": 794456.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 49.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.12668950855731964, "kl": 0.035411637276411057, "learning_rate": 2.443e-06, "loss": 0.0017, "num_tokens": 794820.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 49.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04296569526195526, "kl": 0.0030661100754514337, "learning_rate": 2.4426666666666665e-06, "loss": 0.0001, "num_tokens": 795034.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 49.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01004098355770111, "kl": 0.00011940300464630127, "learning_rate": 2.4423333333333333e-06, "loss": 0.0, "num_tokens": 795246.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 49.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.739889621734619, "kl": 0.020626836456358433, "learning_rate": 2.442e-06, "loss": 0.1356, "num_tokens": 795595.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.18879079818725586, "kl": 0.03710257029160857, "learning_rate": 2.441666666666667e-06, "loss": 0.0018, "num_tokens": 795884.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 49.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.03499231114983559, "kl": 0.012018134817481041, "learning_rate": 2.4413333333333336e-06, "loss": 0.0006, "num_tokens": 796145.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.05656729266047478, "kl": 0.012749699875712395, "learning_rate": 2.441e-06, "loss": 0.0006, "num_tokens": 796434.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.010416666977107525, "clip_ratio/region_mean": 0.010416666977107525, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 49.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 4.339295387268066, "kl": 0.02863692305982113, "learning_rate": 2.440666666666667e-06, "loss": -0.029, "num_tokens": 796758.0, "reward": 2.125, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 1.6007810831069946, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 49.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 2.1148946285247803, "kl": 0.1171901561319828, "learning_rate": 2.440333333333333e-06, "loss": -0.0404, "num_tokens": 797106.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 49.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 3.1947810649871826, "kl": 0.078176059294492, "learning_rate": 2.44e-06, "loss": 0.0036, "num_tokens": 797392.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 49.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.19639906287193298, "kl": 0.02894734777510166, "learning_rate": 2.4396666666666667e-06, "loss": 0.0016, "num_tokens": 797672.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 49.68518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 2.8781650066375732, "kl": 0.05732029862701893, "learning_rate": 2.4393333333333335e-06, "loss": -0.1077, "num_tokens": 798016.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08665555715560913, "kl": 0.036981672048568726, "learning_rate": 2.439e-06, "loss": 0.0018, "num_tokens": 798356.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.037065740674734116, "kl": 0.001828221109462902, "learning_rate": 2.4386666666666666e-06, "loss": 0.0001, "num_tokens": 798635.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 49.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.030324524268507957, "kl": 0.2642036974430084, "learning_rate": 2.4383333333333334e-06, "loss": 0.0132, "num_tokens": 798940.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 49.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021105341147631407, "kl": 0.0008045106951612979, "learning_rate": 2.438e-06, "loss": 0.0, "num_tokens": 799200.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 49.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.028025034815073013, "kl": 0.0010068500705529004, "learning_rate": 2.437666666666667e-06, "loss": 0.0001, "num_tokens": 799468.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 49.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.053984951227903366, "kl": 0.14922355860471725, "learning_rate": 2.4373333333333333e-06, "loss": 0.0075, "num_tokens": 799780.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 49.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.022466666996479034, "kl": 0.0025486857630312443, "learning_rate": 2.437e-06, "loss": 0.0001, "num_tokens": 800040.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 49.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.06602063775062561, "kl": 0.0004631355404853821, "learning_rate": 2.4366666666666665e-06, "loss": 0.0, "num_tokens": 800252.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 49.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.5515338778495789, "kl": 0.05331879248842597, "learning_rate": 2.4363333333333333e-06, "loss": 0.003, "num_tokens": 800531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 49.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0370035246014595, "kl": 0.015739089343696833, "learning_rate": 2.436e-06, "loss": 0.0007, "num_tokens": 800914.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.07922983169555664, "kl": 0.027185593266040087, "learning_rate": 2.435666666666667e-06, "loss": 0.0014, "num_tokens": 801202.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 49.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.11076068878173828, "kl": 0.01239091157913208, "learning_rate": 2.4353333333333336e-06, "loss": 0.0006, "num_tokens": 801470.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 49.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.06370322406291962, "kl": 0.08850816637277603, "learning_rate": 2.435e-06, "loss": 0.0044, "num_tokens": 801834.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 49.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.08588910847902298, "kl": 0.005389301746618003, "learning_rate": 2.4346666666666668e-06, "loss": 0.0003, "num_tokens": 802155.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 49.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09936283528804779, "kl": 0.013003773987293243, "learning_rate": 2.434333333333333e-06, "loss": 0.0007, "num_tokens": 802467.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.1194983720779419, "kl": 0.014567551203072071, "learning_rate": 2.434e-06, "loss": 0.0008, "num_tokens": 802771.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 50.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.7284114360809326, "kl": 0.041163988411426544, "learning_rate": 2.4336666666666667e-06, "loss": 0.0668, "num_tokens": 803107.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 50.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 8.619093894958496, "kl": 0.047781015920918435, "learning_rate": 2.4333333333333335e-06, "loss": 0.0001, "num_tokens": 803327.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 50.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01032946351915598, "kl": 0.26712600886821747, "learning_rate": 2.4330000000000003e-06, "loss": 0.0134, "num_tokens": 803631.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 50.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04903581738471985, "kl": 0.0033776217605918646, "learning_rate": 2.4326666666666666e-06, "loss": 0.0002, "num_tokens": 803958.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 50.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.024718619883060455, "kl": 0.003179234452545643, "learning_rate": 2.4323333333333334e-06, "loss": 0.0002, "num_tokens": 804270.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 50.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.06741975992918015, "kl": 0.01074890187010169, "learning_rate": 2.432e-06, "loss": 0.0005, "num_tokens": 804576.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.07277852296829224, "kl": 0.01764109404757619, "learning_rate": 2.431666666666667e-06, "loss": 0.0009, "num_tokens": 804850.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 50.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.07444789260625839, "kl": 0.008178258314728737, "learning_rate": 2.4313333333333333e-06, "loss": 0.0004, "num_tokens": 805167.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 50.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008052741177380085, "kl": 0.0012273192405700684, "learning_rate": 2.431e-06, "loss": 0.0001, "num_tokens": 805447.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 50.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.04898441210389137, "kl": 0.0042346930131316185, "learning_rate": 2.4306666666666665e-06, "loss": 0.0002, "num_tokens": 805683.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 50.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002621825260575861, "kl": 8.128583431243896e-06, "learning_rate": 2.4303333333333332e-06, "loss": 0.0, "num_tokens": 805903.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 50.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.677748680114746, "kl": 0.01190832769498229, "learning_rate": 2.43e-06, "loss": -0.036, "num_tokens": 806228.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 50.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.1671454906463623, "kl": 0.030420560389757156, "learning_rate": 2.429666666666667e-06, "loss": 0.037, "num_tokens": 806595.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 50.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.009926141239702702, "kl": 0.002172514796257019, "learning_rate": 2.4293333333333336e-06, "loss": 0.0001, "num_tokens": 806811.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 50.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.553558588027954, "kl": 0.5103753441944718, "learning_rate": 2.429e-06, "loss": 0.0613, "num_tokens": 807072.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 5.161747455596924, "kl": 0.07729346863925457, "learning_rate": 2.4286666666666667e-06, "loss": -0.1388, "num_tokens": 807374.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 50.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.65897798538208, "kl": 0.02129838801920414, "learning_rate": 2.428333333333333e-06, "loss": -0.0117, "num_tokens": 807707.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 50.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.016636377200484276, "kl": 0.0006402172148227692, "learning_rate": 2.4280000000000003e-06, "loss": 0.0, "num_tokens": 807967.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 50.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.2735231816768646, "kl": 0.057274749502539635, "learning_rate": 2.4276666666666667e-06, "loss": 0.0029, "num_tokens": 808283.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 50.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 7.809070587158203, "kl": 0.03625241667032242, "learning_rate": 2.4273333333333334e-06, "loss": 0.0365, "num_tokens": 808584.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.08341844379901886, "kl": 0.008286748547106981, "learning_rate": 2.4270000000000002e-06, "loss": 0.0004, "num_tokens": 808852.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 3.658259630203247, "kl": 0.052839044481515884, "learning_rate": 2.4266666666666666e-06, "loss": -0.179, "num_tokens": 809204.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 50.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.11819658428430557, "kl": 0.015196267049759626, "learning_rate": 2.4263333333333334e-06, "loss": 0.0008, "num_tokens": 809533.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 50.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 2.77386736869812, "kl": 0.0791231095790863, "learning_rate": 2.426e-06, "loss": 0.0951, "num_tokens": 809887.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 50.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.00891206320375204, "kl": 0.0021475031971931458, "learning_rate": 2.425666666666667e-06, "loss": 0.0001, "num_tokens": 810123.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 50.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.051420990377664566, "kl": 0.011426992248743773, "learning_rate": 2.4253333333333333e-06, "loss": 0.0006, "num_tokens": 810463.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 50.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.4433037042617798, "kl": 0.012402377324178815, "learning_rate": 2.425e-06, "loss": -0.0009, "num_tokens": 810751.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.020202970132231712, "kl": 0.005042759468778968, "learning_rate": 2.4246666666666664e-06, "loss": 0.0003, "num_tokens": 811021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 50.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.048094965517520905, "kl": 0.011728422716259956, "learning_rate": 2.4243333333333332e-06, "loss": 0.0006, "num_tokens": 811352.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2728 }, { "clip_ratio/high_max": 0.0017301038606092334, "clip_ratio/high_mean": 0.0017301038606092334, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017301038606092334, "completion_length": 86.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 86.75, "completions/mean_terminated_length": 30.33333396911621, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 50.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.9604570865631104, "kl": 0.025076637975871563, "learning_rate": 2.4240000000000004e-06, "loss": 0.387, "num_tokens": 811915.0, "reward": 2.375, "reward_std": 2.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 2.25, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 50.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.6494704484939575, "kl": 0.03713542781770229, "learning_rate": 2.4236666666666668e-06, "loss": -0.0793, "num_tokens": 812334.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 50.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.06151675805449486, "kl": 0.000841870903968811, "learning_rate": 2.4233333333333336e-06, "loss": 0.0, "num_tokens": 812546.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.08187728375196457, "kl": 0.007671724772080779, "learning_rate": 2.423e-06, "loss": 0.0004, "num_tokens": 812830.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 50.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.05948270112276077, "kl": 0.10078465938568115, "learning_rate": 2.4226666666666667e-06, "loss": 0.005, "num_tokens": 813198.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.08605901896953583, "kl": 0.006832719314843416, "learning_rate": 2.4223333333333335e-06, "loss": 0.0003, "num_tokens": 813466.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 50.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.09176906198263168, "kl": 0.006867297692224383, "learning_rate": 2.4220000000000003e-06, "loss": 0.0004, "num_tokens": 813728.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 50.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0165004413574934, "kl": 0.000317606347380206, "learning_rate": 2.4216666666666666e-06, "loss": 0.0, "num_tokens": 813984.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 50.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.07851296663284302, "kl": 0.013246928807348013, "learning_rate": 2.4213333333333334e-06, "loss": 0.0007, "num_tokens": 814245.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 50.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.245790958404541, "kl": 0.09969478845596313, "learning_rate": 2.421e-06, "loss": 0.0353, "num_tokens": 814619.0, "reward": 5.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.34165620803833, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 50.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.02559671737253666, "kl": 0.001442710228729993, "learning_rate": 2.4206666666666666e-06, "loss": 0.0001, "num_tokens": 814899.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 50.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.95418381690979, "kl": 0.036475375294685364, "learning_rate": 2.4203333333333333e-06, "loss": 0.0024, "num_tokens": 815109.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.010515622794628143, "kl": 0.008626106195151806, "learning_rate": 2.42e-06, "loss": 0.0004, "num_tokens": 815381.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 50.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.006234433501958847, "kl": 0.0008897421357687563, "learning_rate": 2.419666666666667e-06, "loss": 0.0, "num_tokens": 815643.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06533730030059814, "kl": 0.006466299062594771, "learning_rate": 2.4193333333333333e-06, "loss": 0.0003, "num_tokens": 815917.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.251893997192383, "kl": 0.025556170847266912, "learning_rate": 2.419e-06, "loss": -0.1101, "num_tokens": 816203.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 50.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.00700110150501132, "kl": 0.0005770400166511536, "learning_rate": 2.4186666666666664e-06, "loss": 0.0, "num_tokens": 816447.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 50.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 6.8227620124816895, "kl": 0.2471969798207283, "learning_rate": 2.418333333333333e-06, "loss": -0.0321, "num_tokens": 816757.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.30015987157821655, "kl": 0.02381989953573793, "learning_rate": 2.4180000000000004e-06, "loss": 0.0013, "num_tokens": 817055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 50.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.05472929775714874, "kl": 0.006903106113895774, "learning_rate": 2.4176666666666668e-06, "loss": 0.0003, "num_tokens": 817348.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.07999484241008759, "kl": 0.015777053777128458, "learning_rate": 2.4173333333333335e-06, "loss": 0.0008, "num_tokens": 817634.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 50.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0765211284160614, "kl": 0.015375125221908092, "learning_rate": 2.417e-06, "loss": 0.0008, "num_tokens": 818048.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 50.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.14390313625335693, "kl": 0.0063751935958862305, "learning_rate": 2.4166666666666667e-06, "loss": 0.0005, "num_tokens": 818264.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 50.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.940622806549072, "kl": 0.03030684869736433, "learning_rate": 2.4163333333333335e-06, "loss": 0.0357, "num_tokens": 818575.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 50.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.018179886043071747, "kl": 0.0007749234209768474, "learning_rate": 2.4160000000000002e-06, "loss": 0.0, "num_tokens": 818843.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 51.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08896339684724808, "kl": 0.010296330321580172, "learning_rate": 2.4156666666666666e-06, "loss": 0.0005, "num_tokens": 819141.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 51.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 7.347787857055664, "kl": 0.05637666955590248, "learning_rate": 2.4153333333333334e-06, "loss": 0.0105, "num_tokens": 819460.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 51.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013472180580720305, "kl": 0.0012846568133682013, "learning_rate": 2.415e-06, "loss": 0.0001, "num_tokens": 819740.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 51.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03685910999774933, "kl": 0.006507994374260306, "learning_rate": 2.4146666666666665e-06, "loss": 0.0003, "num_tokens": 820042.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 51.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 6.2996296882629395, "kl": 0.018799642100930214, "learning_rate": 2.4143333333333333e-06, "loss": 0.2577, "num_tokens": 820340.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 51.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.42697465419769287, "kl": 0.04794200509786606, "learning_rate": 2.414e-06, "loss": 0.0036, "num_tokens": 820552.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 51.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.03810492530465126, "kl": 0.010631228797137737, "learning_rate": 2.413666666666667e-06, "loss": 0.0005, "num_tokens": 820878.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 51.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.33721089363098145, "kl": 0.0593524519354105, "learning_rate": 2.4133333333333332e-06, "loss": 0.0043, "num_tokens": 821183.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 51.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.04073914885520935, "kl": 0.0020348261459730566, "learning_rate": 2.413e-06, "loss": 0.0001, "num_tokens": 821455.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 51.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.00807228684425354, "kl": 0.26762712001800537, "learning_rate": 2.4126666666666664e-06, "loss": 0.0134, "num_tokens": 821759.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 51.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.01383829303085804, "kl": 0.0006056129932403564, "learning_rate": 2.4123333333333336e-06, "loss": 0.0, "num_tokens": 822019.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 51.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.002749810926616192, "kl": 0.016368752345442772, "learning_rate": 2.4120000000000004e-06, "loss": 0.0008, "num_tokens": 822279.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 51.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.10478509962558746, "kl": 0.007327968487516046, "learning_rate": 2.4116666666666667e-06, "loss": 0.0004, "num_tokens": 822606.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 51.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.27007794380188, "kl": 0.07198414951562881, "learning_rate": 2.4113333333333335e-06, "loss": 0.0572, "num_tokens": 822899.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 51.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.100656032562256, "kl": 0.03164546750485897, "learning_rate": 2.411e-06, "loss": 0.1144, "num_tokens": 823180.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 51.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.044763192534446716, "kl": 0.0057826959528028965, "learning_rate": 2.4106666666666667e-06, "loss": 0.0003, "num_tokens": 823470.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 51.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.564068078994751, "kl": 0.025499539449810982, "learning_rate": 2.4103333333333334e-06, "loss": -0.1491, "num_tokens": 823821.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 51.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.026910219341516495, "kl": 0.005641356110572815, "learning_rate": 2.4100000000000002e-06, "loss": 0.0003, "num_tokens": 824109.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 51.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 2.387589693069458, "kl": 0.10334932431578636, "learning_rate": 2.4096666666666666e-06, "loss": 0.1159, "num_tokens": 824454.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 51.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 8.524815559387207, "kl": 0.09898410178720951, "learning_rate": 2.4093333333333334e-06, "loss": -0.0125, "num_tokens": 824731.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 51.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.06173790991306305, "kl": 0.001071445643901825, "learning_rate": 2.409e-06, "loss": 0.0001, "num_tokens": 824944.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 51.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.05020973086357117, "kl": 0.0029343462083488703, "learning_rate": 2.4086666666666665e-06, "loss": 0.0001, "num_tokens": 825187.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 51.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.13685880601406097, "kl": 0.024513863027095795, "learning_rate": 2.4083333333333337e-06, "loss": 0.0012, "num_tokens": 825531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 51.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.03583168238401413, "kl": 0.0018282131059095263, "learning_rate": 2.408e-06, "loss": 0.0001, "num_tokens": 825828.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 51.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05771275609731674, "kl": 0.004931454313918948, "learning_rate": 2.407666666666667e-06, "loss": 0.0003, "num_tokens": 826144.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 51.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.123723983764648, "kl": 0.09181689098477364, "learning_rate": 2.407333333333333e-06, "loss": 0.0424, "num_tokens": 826483.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 51.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.010458242148160934, "kl": 0.0023326099617406726, "learning_rate": 2.407e-06, "loss": 0.0001, "num_tokens": 826703.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 51.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.01242623571306467, "kl": 0.0002751588763203472, "learning_rate": 2.4066666666666668e-06, "loss": 0.0, "num_tokens": 826959.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 51.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0755714550614357, "kl": 0.02212864439934492, "learning_rate": 2.4063333333333336e-06, "loss": 0.0011, "num_tokens": 827249.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 51.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 2.0595531463623047, "kl": 0.2190406396985054, "learning_rate": 2.4060000000000003e-06, "loss": 0.011, "num_tokens": 827563.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 51.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.09541331231594086, "kl": 0.011347746010869741, "learning_rate": 2.4056666666666667e-06, "loss": 0.0006, "num_tokens": 827896.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 51.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031813124660402536, "kl": 0.001807287335395813, "learning_rate": 2.4053333333333335e-06, "loss": 0.0001, "num_tokens": 828208.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 51.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.009273199364542961, "kl": 0.0021372660994529724, "learning_rate": 2.405e-06, "loss": 0.0001, "num_tokens": 828444.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 51.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.017374370247125626, "kl": 0.0016625404241494834, "learning_rate": 2.4046666666666666e-06, "loss": 0.0001, "num_tokens": 828704.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 51.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.017382580786943436, "kl": 0.006710775662213564, "learning_rate": 2.4043333333333334e-06, "loss": 0.0004, "num_tokens": 829008.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 51.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 11.108415603637695, "kl": 0.06864787393715233, "learning_rate": 2.404e-06, "loss": 0.0355, "num_tokens": 829279.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 51.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.13853999972343445, "kl": 0.0072122784331440926, "learning_rate": 2.4036666666666666e-06, "loss": 0.0004, "num_tokens": 829506.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 51.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.055810511112213135, "kl": 0.15573750436306, "learning_rate": 2.4033333333333333e-06, "loss": 0.0078, "num_tokens": 829816.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 51.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1302904188632965, "kl": 0.007078180671669543, "learning_rate": 2.403e-06, "loss": 0.0004, "num_tokens": 830032.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 51.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.14294971525669098, "kl": 0.024903587996959686, "learning_rate": 2.4026666666666665e-06, "loss": 0.0012, "num_tokens": 830382.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 51.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003949082165490836, "kl": 2.4043023586273193e-05, "learning_rate": 2.4023333333333337e-06, "loss": 0.0, "num_tokens": 830602.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 51.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.015595119446516037, "kl": 0.0028768021147698164, "learning_rate": 2.402e-06, "loss": 0.0001, "num_tokens": 830886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 51.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.057516466826200485, "kl": 0.09041155502200127, "learning_rate": 2.401666666666667e-06, "loss": 0.0045, "num_tokens": 831250.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 51.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.13289222121238708, "kl": 0.03229084413032979, "learning_rate": 2.401333333333333e-06, "loss": 0.0016, "num_tokens": 831539.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 51.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.006144684739410877, "kl": 0.0010871917475014925, "learning_rate": 2.401e-06, "loss": 0.0001, "num_tokens": 831799.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 57.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 51.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 2.723008632659912, "kl": 0.03888143226504326, "learning_rate": 2.4006666666666667e-06, "loss": 0.297, "num_tokens": 832243.0, "reward": 1.875, "reward_std": 3.25, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 3.25, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 51.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.03026449866592884, "kl": 0.0013829807576257735, "learning_rate": 2.4003333333333335e-06, "loss": 0.0001, "num_tokens": 832513.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 51.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.13052216172218323, "kl": 0.03593877051025629, "learning_rate": 2.4000000000000003e-06, "loss": 0.0019, "num_tokens": 832833.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 51.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 4.955604553222656, "kl": 0.06896631233394146, "learning_rate": 2.3996666666666667e-06, "loss": 0.1753, "num_tokens": 833196.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 51.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.029458703473210335, "kl": 0.006023412570357323, "learning_rate": 2.3993333333333335e-06, "loss": 0.0003, "num_tokens": 833466.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 51.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.8770244121551514, "kl": 0.10628250613808632, "learning_rate": 2.399e-06, "loss": 0.0066, "num_tokens": 833708.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 51.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.715651035308838, "kl": 0.10342739894986153, "learning_rate": 2.3986666666666666e-06, "loss": 0.039, "num_tokens": 834050.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 51.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06230156868696213, "kl": 0.006974156480282545, "learning_rate": 2.3983333333333334e-06, "loss": 0.0004, "num_tokens": 834380.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 51.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06352169811725616, "kl": 0.03610933106392622, "learning_rate": 2.398e-06, "loss": 0.0018, "num_tokens": 834787.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 52.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.14158853888511658, "kl": 0.009810572722926736, "learning_rate": 2.3976666666666665e-06, "loss": 0.0005, "num_tokens": 835054.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 52.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07737588882446289, "kl": 0.022331008221954107, "learning_rate": 2.3973333333333333e-06, "loss": 0.0011, "num_tokens": 835348.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 52.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0573759600520134, "kl": 0.005496953381225467, "learning_rate": 2.397e-06, "loss": 0.0003, "num_tokens": 835641.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 52.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.11601516604423523, "kl": 0.02744139451533556, "learning_rate": 2.396666666666667e-06, "loss": 0.0014, "num_tokens": 835944.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 52.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.008734889328479767, "kl": 0.0016702950233593583, "learning_rate": 2.3963333333333337e-06, "loss": 0.0001, "num_tokens": 836164.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 52.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.006803820841014385, "kl": 0.0005280971527099609, "learning_rate": 2.396e-06, "loss": 0.0, "num_tokens": 836424.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 52.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 17.146289825439453, "kl": 0.012594989500939846, "learning_rate": 2.395666666666667e-06, "loss": 0.291, "num_tokens": 836670.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 52.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.07020723074674606, "kl": 0.002377644181251526, "learning_rate": 2.395333333333333e-06, "loss": 0.0001, "num_tokens": 836914.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 52.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.056235987693071365, "kl": 0.005837368196807802, "learning_rate": 2.395e-06, "loss": 0.0003, "num_tokens": 837245.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 52.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.016951628029346466, "kl": 0.004820444737561047, "learning_rate": 2.3946666666666667e-06, "loss": 0.0002, "num_tokens": 837533.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 52.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.16498813033103943, "kl": 0.05341300368309021, "learning_rate": 2.3943333333333335e-06, "loss": 0.0027, "num_tokens": 837834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 52.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012449953937903047, "kl": 0.001224594481755048, "learning_rate": 2.3940000000000003e-06, "loss": 0.0001, "num_tokens": 838114.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 52.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.09871586412191391, "kl": 0.03253638092428446, "learning_rate": 2.3936666666666666e-06, "loss": 0.0016, "num_tokens": 838477.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 52.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.12448083609342575, "kl": 0.002673305571079254, "learning_rate": 2.3933333333333334e-06, "loss": 0.0001, "num_tokens": 838689.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 52.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.009484909474849701, "kl": 0.0004921044746879488, "learning_rate": 2.393e-06, "loss": 0.0, "num_tokens": 838924.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 52.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.005183587782084942, "kl": 0.0023222336312755942, "learning_rate": 2.3926666666666666e-06, "loss": 0.0001, "num_tokens": 839208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 52.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.015094439499080181, "kl": 0.0008409619040321559, "learning_rate": 2.3923333333333334e-06, "loss": 0.0, "num_tokens": 839480.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 52.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029992111958563328, "kl": 0.0016938485205173492, "learning_rate": 2.392e-06, "loss": 0.0001, "num_tokens": 839792.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 52.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.028094839304685593, "kl": 0.0018907834310084581, "learning_rate": 2.391666666666667e-06, "loss": 0.0001, "num_tokens": 840066.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 52.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.1068010926246643, "kl": 0.03681113198399544, "learning_rate": 2.3913333333333333e-06, "loss": 0.0018, "num_tokens": 840421.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 52.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.04650650545954704, "kl": 0.03493187949061394, "learning_rate": 2.391e-06, "loss": 0.0017, "num_tokens": 840737.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 52.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.06854855269193649, "kl": 0.017199012450873852, "learning_rate": 2.390666666666667e-06, "loss": 0.0008, "num_tokens": 841069.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 52.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.017151908949017525, "kl": 0.26613669097423553, "learning_rate": 2.3903333333333336e-06, "loss": 0.0133, "num_tokens": 841373.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 52.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067215291783213615, "kl": 0.0013058037147857249, "learning_rate": 2.39e-06, "loss": 0.0001, "num_tokens": 841633.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 52.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06889953464269638, "kl": 0.01987474039196968, "learning_rate": 2.3896666666666668e-06, "loss": 0.001, "num_tokens": 841915.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 52.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.028096316382288933, "kl": 0.002080070087686181, "learning_rate": 2.389333333333333e-06, "loss": 0.0001, "num_tokens": 842183.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 52.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09361562132835388, "kl": 0.0048256367444992065, "learning_rate": 2.389e-06, "loss": 0.0002, "num_tokens": 842395.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 52.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.009083788841962814, "kl": 0.0048482418060302734, "learning_rate": 2.3886666666666667e-06, "loss": 0.0002, "num_tokens": 842663.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 52.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07530864328145981, "kl": 0.1553303226828575, "learning_rate": 2.3883333333333335e-06, "loss": 0.0078, "num_tokens": 842974.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 52.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.027725214138627052, "kl": 0.004206315497867763, "learning_rate": 2.3880000000000003e-06, "loss": 0.0002, "num_tokens": 843276.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 52.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0725192129611969, "kl": 0.0014049112796783447, "learning_rate": 2.3876666666666666e-06, "loss": 0.0001, "num_tokens": 843532.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 52.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.011141877621412277, "kl": 0.0006590724806301296, "learning_rate": 2.3873333333333334e-06, "loss": 0.0, "num_tokens": 843852.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 52.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 1.5030946731567383, "kl": 0.07586564496159554, "learning_rate": 2.3869999999999998e-06, "loss": 0.0424, "num_tokens": 844188.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 52.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.03774819150567055, "kl": 0.007036800729110837, "learning_rate": 2.386666666666667e-06, "loss": 0.0003, "num_tokens": 844479.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 52.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.01741122081875801, "kl": 0.001554942165967077, "learning_rate": 2.3863333333333333e-06, "loss": 0.0001, "num_tokens": 844739.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 52.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031002017203718424, "kl": 0.01629612222313881, "learning_rate": 2.386e-06, "loss": 0.0008, "num_tokens": 844999.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 52.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.3425886332988739, "kl": 0.07018738985061646, "learning_rate": 2.385666666666667e-06, "loss": 0.0035, "num_tokens": 845406.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 52.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.33491626381874084, "kl": 0.028047680854797363, "learning_rate": 2.3853333333333333e-06, "loss": 0.0014, "num_tokens": 845618.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 52.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.15327498316764832, "kl": 0.06014397367835045, "learning_rate": 2.385e-06, "loss": 0.003, "num_tokens": 845967.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 52.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.7377099990844727, "kl": 0.06164982728660107, "learning_rate": 2.384666666666667e-06, "loss": -0.1307, "num_tokens": 846325.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 52.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 10.34040641784668, "kl": 0.047206103801727295, "learning_rate": 2.3843333333333336e-06, "loss": 0.0664, "num_tokens": 846604.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 52.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.619198799133301, "kl": 0.09781079739332199, "learning_rate": 2.384e-06, "loss": 0.0479, "num_tokens": 846941.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 52.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.06523977965116501, "kl": 0.08886785805225372, "learning_rate": 2.3836666666666667e-06, "loss": 0.0044, "num_tokens": 847305.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 52.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00035995987127535045, "kl": 5.9567391872406006e-05, "learning_rate": 2.383333333333333e-06, "loss": 0.0, "num_tokens": 847525.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 52.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.04773977771401405, "kl": 0.00898170773871243, "learning_rate": 2.383e-06, "loss": 0.0004, "num_tokens": 847797.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 52.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.03776170685887337, "kl": 0.010953939985483885, "learning_rate": 2.382666666666667e-06, "loss": 0.0005, "num_tokens": 848123.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 52.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.07799394428730011, "kl": 0.021837515902007, "learning_rate": 2.3823333333333335e-06, "loss": 0.0011, "num_tokens": 848411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 52.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.008788693696260452, "kl": 0.0017764195799827576, "learning_rate": 2.3820000000000002e-06, "loss": 0.0001, "num_tokens": 848627.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 52.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.08096367865800858, "kl": 0.00670659338356927, "learning_rate": 2.3816666666666666e-06, "loss": 0.0003, "num_tokens": 848941.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 52.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.1365240067243576, "kl": 0.006359230261296034, "learning_rate": 2.3813333333333334e-06, "loss": 0.0003, "num_tokens": 849237.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 52.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.017864925786852837, "kl": 0.007465390954166651, "learning_rate": 2.381e-06, "loss": 0.0004, "num_tokens": 849511.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.75, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 52.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.2301209270954132, "kl": 0.024494120851159096, "learning_rate": 2.380666666666667e-06, "loss": 0.0012, "num_tokens": 849926.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 52.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06736123561859131, "kl": 0.009781391359865665, "learning_rate": 2.3803333333333333e-06, "loss": 0.0005, "num_tokens": 850184.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 52.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02686416730284691, "kl": 0.005065662087872624, "learning_rate": 2.38e-06, "loss": 0.0003, "num_tokens": 850516.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.10447411984205246, "kl": 0.02131066471338272, "learning_rate": 2.379666666666667e-06, "loss": 0.0011, "num_tokens": 850804.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 53.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.2185179740190506, "kl": 0.02052814792841673, "learning_rate": 2.3793333333333332e-06, "loss": 0.0011, "num_tokens": 851080.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08419032394886017, "kl": 0.036733237095177174, "learning_rate": 2.379e-06, "loss": 0.0018, "num_tokens": 851352.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 53.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 5.869204521179199, "kl": 0.16254862397909164, "learning_rate": 2.378666666666667e-06, "loss": 0.0596, "num_tokens": 851665.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2865 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 53.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 4.463963985443115, "kl": 0.08973372355103493, "learning_rate": 2.3783333333333336e-06, "loss": -0.0053, "num_tokens": 851990.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.07112845778465271, "kl": 0.028711308652418666, "learning_rate": 2.378e-06, "loss": 0.0014, "num_tokens": 852278.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 53.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.05987062305212021, "kl": 0.007646935526281595, "learning_rate": 2.3776666666666667e-06, "loss": 0.0004, "num_tokens": 852584.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 53.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.03847935423254967, "kl": 0.0003005564212799072, "learning_rate": 2.377333333333333e-06, "loss": 0.0, "num_tokens": 852796.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 53.148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 2.799895763397217, "kl": 0.025639529339969158, "learning_rate": 2.377e-06, "loss": 0.255, "num_tokens": 853154.0, "reward": 6.050000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 6.050000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 53.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029934579506516457, "kl": 0.0018062489107251167, "learning_rate": 2.376666666666667e-06, "loss": 0.0001, "num_tokens": 853466.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 53.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 4.199644565582275, "kl": 0.11968174437060952, "learning_rate": 2.3763333333333334e-06, "loss": 0.1412, "num_tokens": 853790.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 53.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10542991757392883, "kl": 0.039005622267723083, "learning_rate": 2.376e-06, "loss": 0.0021, "num_tokens": 854158.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 53.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.2771737575531006, "kl": 0.06000687135383487, "learning_rate": 2.3756666666666666e-06, "loss": 0.0324, "num_tokens": 854477.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 53.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.010056917555630207, "kl": 0.0006262307288125157, "learning_rate": 2.3753333333333333e-06, "loss": 0.0, "num_tokens": 854747.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 53.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04362093284726143, "kl": 0.006896126549690962, "learning_rate": 2.375e-06, "loss": 0.0003, "num_tokens": 855035.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 53.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 8.952740669250488, "kl": 0.015314777381718159, "learning_rate": 2.374666666666667e-06, "loss": 0.0794, "num_tokens": 855371.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 53.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.011432938277721405, "kl": 0.001405414892360568, "learning_rate": 2.3743333333333333e-06, "loss": 0.0001, "num_tokens": 855631.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 53.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 3.015829563140869, "kl": 0.10992373898625374, "learning_rate": 2.374e-06, "loss": -0.0708, "num_tokens": 856001.0, "reward": 6.625, "reward_std": 2.428133726119995, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.428133726119995, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 53.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0546252615749836, "kl": 0.0028338336560409516, "learning_rate": 2.373666666666667e-06, "loss": 0.0001, "num_tokens": 856235.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.048110343515872955, "kl": 0.009807596215978265, "learning_rate": 2.373333333333333e-06, "loss": 0.0005, "num_tokens": 856539.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 53.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.08813687413930893, "kl": 0.034528578631579876, "learning_rate": 2.373e-06, "loss": 0.0017, "num_tokens": 856875.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 53.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.004411724396049976, "kl": 0.016107586212456226, "learning_rate": 2.3726666666666668e-06, "loss": 0.0008, "num_tokens": 857135.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 53.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.162947654724121, "kl": 0.06655381433665752, "learning_rate": 2.3723333333333335e-06, "loss": -0.0039, "num_tokens": 857538.0, "reward": 2.25, "reward_std": 1.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.5, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 53.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.13354608416557312, "kl": 0.007072292268276215, "learning_rate": 2.372e-06, "loss": 0.0003, "num_tokens": 857748.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 53.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06629502773284912, "kl": 0.0019564018584787846, "learning_rate": 2.3716666666666667e-06, "loss": 0.0001, "num_tokens": 858044.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 53.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0499211922287941, "kl": 0.0028888892848044634, "learning_rate": 2.371333333333333e-06, "loss": 0.0001, "num_tokens": 858287.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 53.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.012173829600214958, "kl": 0.05078907683491707, "learning_rate": 2.3710000000000003e-06, "loss": 0.0025, "num_tokens": 858619.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 53.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.867785453796387, "kl": 0.09830822050571442, "learning_rate": 2.370666666666667e-06, "loss": 0.4734, "num_tokens": 858926.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 53.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05336439609527588, "kl": 0.006315299076959491, "learning_rate": 2.3703333333333334e-06, "loss": 0.0003, "num_tokens": 859222.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 53.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06559010595083237, "kl": 0.00980394147336483, "learning_rate": 2.37e-06, "loss": 0.0005, "num_tokens": 859496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 6.212857246398926, "kl": 0.029517007991671562, "learning_rate": 2.3696666666666665e-06, "loss": 0.1779, "num_tokens": 859782.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 53.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014080351684242487, "kl": 0.0013441123301163316, "learning_rate": 2.3693333333333333e-06, "loss": 0.0001, "num_tokens": 860062.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 53.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.16058064997196198, "kl": 0.008736562798731029, "learning_rate": 2.369e-06, "loss": 0.0004, "num_tokens": 860318.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 53.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 12.153318405151367, "kl": 0.01882866397500038, "learning_rate": 2.368666666666667e-06, "loss": 0.0026, "num_tokens": 860578.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 53.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003658926289062947, "kl": 6.495416164398193e-05, "learning_rate": 2.3683333333333332e-06, "loss": 0.0, "num_tokens": 860798.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 53.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.07831844687461853, "kl": 0.0020904242992401123, "learning_rate": 2.368e-06, "loss": 0.0002, "num_tokens": 861014.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 53.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.02326568402349949, "kl": 0.09351816400885582, "learning_rate": 2.367666666666667e-06, "loss": 0.0047, "num_tokens": 861380.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.68518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 5.987786293029785, "kl": 0.3220704160630703, "learning_rate": 2.367333333333333e-06, "loss": 0.0075, "num_tokens": 861665.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08635083585977554, "kl": 0.01907160598784685, "learning_rate": 2.367e-06, "loss": 0.001, "num_tokens": 861953.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.25, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 53.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.488400459289551, "kl": 0.07212156802415848, "learning_rate": 2.3666666666666667e-06, "loss": 0.331, "num_tokens": 862430.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.9524062871932983, "kl": 0.00367978448048234, "learning_rate": 2.3663333333333335e-06, "loss": -0.0, "num_tokens": 862714.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 53.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.041116755455732346, "kl": 0.0270707830786705, "learning_rate": 2.366e-06, "loss": 0.0011, "num_tokens": 863099.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 53.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.005555221810936928, "kl": 0.0029350891709327698, "learning_rate": 2.3656666666666667e-06, "loss": 0.0001, "num_tokens": 863335.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 53.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.013231276534497738, "kl": 0.266848623752594, "learning_rate": 2.3653333333333334e-06, "loss": 0.0133, "num_tokens": 863639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 53.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.04293171688914299, "kl": 0.008126812055706978, "learning_rate": 2.3650000000000002e-06, "loss": 0.0004, "num_tokens": 863909.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 53.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.27496668696403503, "kl": 0.028814285062253475, "learning_rate": 2.364666666666667e-06, "loss": 0.0014, "num_tokens": 864241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 53.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 5.342770576477051, "kl": 0.10318545438349247, "learning_rate": 2.3643333333333334e-06, "loss": 0.0761, "num_tokens": 864545.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 53.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.02570682018995285, "kl": 0.0009674452594481409, "learning_rate": 2.364e-06, "loss": 0.0, "num_tokens": 864863.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 53.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 2.9785778522491455, "kl": 0.14013474993407726, "learning_rate": 2.3636666666666665e-06, "loss": 0.1152, "num_tokens": 865201.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 53.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.001898481510579586, "kl": 0.0007997065840754658, "learning_rate": 2.3633333333333333e-06, "loss": 0.0, "num_tokens": 865461.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 53.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.021994970738887787, "kl": 0.002806131378747523, "learning_rate": 2.363e-06, "loss": 0.0001, "num_tokens": 865741.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 53.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.22368617355823517, "kl": 0.013450096594169736, "learning_rate": 2.362666666666667e-06, "loss": 0.0008, "num_tokens": 865963.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 53.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06795808672904968, "kl": 0.009241765830665827, "learning_rate": 2.3623333333333332e-06, "loss": 0.0005, "num_tokens": 866293.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 53.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.1688670814037323, "kl": 0.025433705188333988, "learning_rate": 2.362e-06, "loss": 0.0013, "num_tokens": 866593.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 54.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.05443761497735977, "kl": 0.007130228914320469, "learning_rate": 2.3616666666666668e-06, "loss": 0.0004, "num_tokens": 866861.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 54.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.023576179519295692, "kl": 0.005491400370374322, "learning_rate": 2.361333333333333e-06, "loss": 0.0003, "num_tokens": 867129.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 54.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09523707628250122, "kl": 0.012822123942896724, "learning_rate": 2.3610000000000003e-06, "loss": 0.0008, "num_tokens": 867391.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 54.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 7.157207489013672, "kl": 0.023810354061424732, "learning_rate": 2.3606666666666667e-06, "loss": 0.0399, "num_tokens": 867752.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 54.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017751636914908886, "kl": 3.5278499126434326e-05, "learning_rate": 2.3603333333333335e-06, "loss": 0.0, "num_tokens": 867964.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 54.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.36503705382347107, "kl": 0.03425721265375614, "learning_rate": 2.36e-06, "loss": 0.0017, "num_tokens": 868270.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 54.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.00037777298712171614, "kl": 6.160885095596313e-05, "learning_rate": 2.3596666666666666e-06, "loss": 0.0, "num_tokens": 868490.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.06831710040569305, "kl": 0.015107514336705208, "learning_rate": 2.3593333333333334e-06, "loss": 0.0008, "num_tokens": 868774.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 54.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028325184248387814, "kl": 0.0003734744241228327, "learning_rate": 2.359e-06, "loss": 0.0, "num_tokens": 869086.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 54.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.07990197837352753, "kl": 0.16134540736675262, "learning_rate": 2.358666666666667e-06, "loss": 0.0081, "num_tokens": 869397.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 54.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.17696452140808105, "kl": 0.006129124900326133, "learning_rate": 2.3583333333333333e-06, "loss": 0.0003, "num_tokens": 869671.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.13193003833293915, "kl": 0.010700775310397148, "learning_rate": 2.358e-06, "loss": 0.0005, "num_tokens": 869961.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.14096450805664062, "kl": 0.04770764522254467, "learning_rate": 2.3576666666666665e-06, "loss": 0.0024, "num_tokens": 870234.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 54.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011629119981080294, "kl": 0.0013172925100661814, "learning_rate": 2.3573333333333333e-06, "loss": 0.0001, "num_tokens": 870514.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.007323765195906162, "kl": 0.010137287434190512, "learning_rate": 2.357e-06, "loss": 0.0005, "num_tokens": 870786.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 66.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 66.0, "completions/mean_terminated_length": 2.6666667461395264, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 54.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.352905035018921, "kl": 0.007515184581279755, "learning_rate": 2.356666666666667e-06, "loss": 0.4923, "num_tokens": 871246.0, "reward": 1.7999999523162842, "reward_std": 3.4000000953674316, "rewards/reward_combined/mean": 1.7999999523162842, "rewards/reward_combined/std": 3.3999998569488525, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 54.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.006253378931432962, "kl": 0.0027934685349464417, "learning_rate": 2.356333333333333e-06, "loss": 0.0001, "num_tokens": 871482.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 54.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.04821046069264412, "kl": 0.026681117713451385, "learning_rate": 2.356e-06, "loss": 0.0013, "num_tokens": 871833.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 54.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.48042118549346924, "kl": 0.08606377243995667, "learning_rate": 2.3556666666666668e-06, "loss": 0.0042, "num_tokens": 872164.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 54.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.009499441832304, "kl": 0.0004187636077404022, "learning_rate": 2.3553333333333335e-06, "loss": 0.0, "num_tokens": 872408.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 54.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.06627187877893448, "kl": 0.008202591445297003, "learning_rate": 2.3550000000000003e-06, "loss": 0.0004, "num_tokens": 872680.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 54.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.4705772399902344, "kl": 0.030181247740983963, "learning_rate": 2.3546666666666667e-06, "loss": 0.0013, "num_tokens": 872899.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 54.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.576265811920166, "kl": 0.0294879162684083, "learning_rate": 2.3543333333333335e-06, "loss": 0.0471, "num_tokens": 873236.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 54.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.032603777945041656, "kl": 0.002134602051228285, "learning_rate": 2.354e-06, "loss": 0.0001, "num_tokens": 873504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 54.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.012353312224149704, "kl": 0.003278389573097229, "learning_rate": 2.3536666666666666e-06, "loss": 0.0002, "num_tokens": 873720.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018518518656492233, "clip_ratio/low_min": 0.018518518656492233, "clip_ratio/region_mean": 0.018518518656492233, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 54.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.3510215282440186, "kl": 0.36710023880004883, "learning_rate": 2.3533333333333334e-06, "loss": 0.0219, "num_tokens": 874119.0, "reward": 1.625, "reward_std": 1.314977765083313, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.3149778842926025, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 54.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01726127415895462, "kl": 0.005183990811929107, "learning_rate": 2.353e-06, "loss": 0.0003, "num_tokens": 874408.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 54.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.3342766761779785, "kl": 0.07593364268541336, "learning_rate": 2.352666666666667e-06, "loss": 0.0038, "num_tokens": 874746.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 54.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.11513447761535645, "kl": 0.008505598991177976, "learning_rate": 2.3523333333333333e-06, "loss": 0.0004, "num_tokens": 874980.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 54.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.13031171262264252, "kl": 0.03345475532114506, "learning_rate": 2.352e-06, "loss": 0.0017, "num_tokens": 875276.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 54.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.10780889540910721, "kl": 0.02938194014132023, "learning_rate": 2.3516666666666665e-06, "loss": 0.0015, "num_tokens": 875603.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 54.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.03160043805837631, "kl": 0.001262330886675045, "learning_rate": 2.3513333333333332e-06, "loss": 0.0001, "num_tokens": 875923.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 54.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.07768853008747101, "kl": 0.00892961397767067, "learning_rate": 2.351e-06, "loss": 0.0005, "num_tokens": 876196.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 54.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.11657459288835526, "kl": 0.004745267331600189, "learning_rate": 2.350666666666667e-06, "loss": 0.0003, "num_tokens": 876410.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 54.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.2351139485836029, "kl": 0.06614034064114094, "learning_rate": 2.3503333333333336e-06, "loss": 0.0034, "num_tokens": 876713.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.25175708532333374, "kl": 0.030793271958827972, "learning_rate": 2.35e-06, "loss": 0.0016, "num_tokens": 876995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 54.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0694553479552269, "kl": 0.0035574163775891066, "learning_rate": 2.3496666666666667e-06, "loss": 0.0002, "num_tokens": 877291.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 54.68518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 6.575503826141357, "kl": 0.1543746218085289, "learning_rate": 2.3493333333333335e-06, "loss": 0.0365, "num_tokens": 877643.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03691914677619934, "kl": 0.014725782479217742, "learning_rate": 2.3490000000000003e-06, "loss": 0.0008, "num_tokens": 877929.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 54.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.046783048659563065, "kl": 0.0012613177314051427, "learning_rate": 2.3486666666666667e-06, "loss": 0.0001, "num_tokens": 878185.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 54.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.2654917538166046, "kl": 0.017351491376757622, "learning_rate": 2.3483333333333334e-06, "loss": 0.0009, "num_tokens": 878515.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 54.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.12645743787288666, "kl": 0.02534924726933241, "learning_rate": 2.348e-06, "loss": 0.0013, "num_tokens": 878849.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 54.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.01918371580541134, "kl": 0.0005114320665597916, "learning_rate": 2.3476666666666666e-06, "loss": 0.0, "num_tokens": 879161.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 54.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019872759003192186, "kl": 0.0004409998655319214, "learning_rate": 2.3473333333333334e-06, "loss": 0.0, "num_tokens": 879433.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 54.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.01509452611207962, "kl": 0.2664918303489685, "learning_rate": 2.347e-06, "loss": 0.0133, "num_tokens": 879737.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 54.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.12018117308616638, "kl": 0.012594382744282484, "learning_rate": 2.346666666666667e-06, "loss": 0.0007, "num_tokens": 880061.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 54.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.010643661953508854, "kl": 0.0010574540356174111, "learning_rate": 2.3463333333333333e-06, "loss": 0.0001, "num_tokens": 880323.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 54.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.09327709674835205, "kl": 0.08426785469055176, "learning_rate": 2.346e-06, "loss": 0.0042, "num_tokens": 880687.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 54.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.008367214351892471, "kl": 0.0008837968052830547, "learning_rate": 2.3456666666666664e-06, "loss": 0.0, "num_tokens": 880947.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 54.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.3067464232444763, "kl": 0.07646085321903229, "learning_rate": 2.3453333333333336e-06, "loss": 0.003, "num_tokens": 881285.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 54.925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 3.5075974464416504, "kl": 0.057480476796627045, "learning_rate": 2.345e-06, "loss": 0.0485, "num_tokens": 881666.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 54.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.046370603144168854, "kl": 0.007213607896119356, "learning_rate": 2.3446666666666668e-06, "loss": 0.0004, "num_tokens": 882000.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 54.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035865188110619783, "kl": 0.016245152801275253, "learning_rate": 2.3443333333333336e-06, "loss": 0.0008, "num_tokens": 882260.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 54.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08152646571397781, "kl": 0.008194145280867815, "learning_rate": 2.344e-06, "loss": 0.0004, "num_tokens": 882549.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 55.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.025813622400164604, "kl": 0.0010362975299358368, "learning_rate": 2.3436666666666667e-06, "loss": 0.0001, "num_tokens": 882809.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 55.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.762274980545044, "kl": 0.11066636070609093, "learning_rate": 2.3433333333333335e-06, "loss": 0.0606, "num_tokens": 883152.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 55.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06702513992786407, "kl": 0.027682156302034855, "learning_rate": 2.3430000000000003e-06, "loss": 0.0014, "num_tokens": 883455.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 55.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.3494166135787964, "kl": 0.04059334844350815, "learning_rate": 2.3426666666666666e-06, "loss": 0.0017, "num_tokens": 883773.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 55.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.041744161397218704, "kl": 0.012252789922058582, "learning_rate": 2.3423333333333334e-06, "loss": 0.0006, "num_tokens": 884179.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 55.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.07226499915122986, "kl": 0.007680713664740324, "learning_rate": 2.3419999999999998e-06, "loss": 0.0004, "num_tokens": 884449.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 55.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.23819291591644287, "kl": 0.04472205974161625, "learning_rate": 2.3416666666666666e-06, "loss": 0.0022, "num_tokens": 884810.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 55.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003592208959162235, "kl": 6.392598152160645e-05, "learning_rate": 2.3413333333333338e-06, "loss": 0.0, "num_tokens": 885030.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 55.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.23653821647167206, "kl": 0.02786953785107471, "learning_rate": 2.341e-06, "loss": 0.0014, "num_tokens": 885298.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 55.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.1875661462545395, "kl": 0.015617812983691692, "learning_rate": 2.340666666666667e-06, "loss": 0.0007, "num_tokens": 885532.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 55.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 4.101525783538818, "kl": 0.10640087351202965, "learning_rate": 2.3403333333333333e-06, "loss": 0.0224, "num_tokens": 885854.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 55.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.115035280585289, "kl": 0.03137406148016453, "learning_rate": 2.34e-06, "loss": 0.0016, "num_tokens": 886148.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 55.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.10560853779315948, "kl": 0.022690760903060436, "learning_rate": 2.339666666666667e-06, "loss": 0.0012, "num_tokens": 886492.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 55.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.039279937744140625, "kl": 0.008485760539770126, "learning_rate": 2.3393333333333336e-06, "loss": 0.0004, "num_tokens": 886780.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 55.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07722441107034683, "kl": 0.010541489813476801, "learning_rate": 2.339e-06, "loss": 0.0005, "num_tokens": 887086.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 84.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 84.25, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 55.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.4168365001678467, "kl": 0.03890683501958847, "learning_rate": 2.3386666666666668e-06, "loss": 0.3008, "num_tokens": 887639.0, "reward": 3.424999952316284, "reward_std": 3.797696828842163, "rewards/reward_combined/mean": 3.424999952316284, "rewards/reward_combined/std": 3.797696590423584, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 55.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1464974731206894, "kl": 0.019229216501116753, "learning_rate": 2.3383333333333335e-06, "loss": 0.001, "num_tokens": 887923.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 82.75, "completions/mean_terminated_length": 25.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 55.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 1.5898351669311523, "kl": 0.026735293678939342, "learning_rate": 2.338e-06, "loss": 0.4322, "num_tokens": 888506.0, "reward": 5.550000190734863, "reward_std": 3.9000000953674316, "rewards/reward_combined/mean": 5.550000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 55.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.02812797762453556, "kl": 0.003407878102734685, "learning_rate": 2.3376666666666667e-06, "loss": 0.0002, "num_tokens": 888802.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 55.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.04721495509147644, "kl": 0.007408793084323406, "learning_rate": 2.3373333333333335e-06, "loss": 0.0004, "num_tokens": 889123.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 55.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.2649492621421814, "kl": 0.019243795075453818, "learning_rate": 2.3370000000000002e-06, "loss": 0.0006, "num_tokens": 889377.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 55.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.055546633899211884, "kl": 0.026678916066884995, "learning_rate": 2.3366666666666666e-06, "loss": 0.0013, "num_tokens": 889795.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 55.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.46795764565467834, "kl": 0.0460997000336647, "learning_rate": 2.3363333333333334e-06, "loss": 0.0023, "num_tokens": 890055.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 55.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.006224581506103277, "kl": 0.00042625516653060913, "learning_rate": 2.3359999999999997e-06, "loss": 0.0, "num_tokens": 890299.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 55.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.7861798405647278, "kl": 0.10501637309789658, "learning_rate": 2.3356666666666665e-06, "loss": 0.0056, "num_tokens": 890613.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 55.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.3208851218223572, "kl": 0.04298049118369818, "learning_rate": 2.3353333333333337e-06, "loss": 0.0022, "num_tokens": 890904.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 55.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07854863256216049, "kl": 0.15339665114879608, "learning_rate": 2.335e-06, "loss": 0.0077, "num_tokens": 891217.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 55.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.008497952483594418, "kl": 0.00971553847193718, "learning_rate": 2.334666666666667e-06, "loss": 0.0005, "num_tokens": 891489.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 55.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04913301765918732, "kl": 0.005250135902315378, "learning_rate": 2.3343333333333332e-06, "loss": 0.0003, "num_tokens": 891761.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 55.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05256142467260361, "kl": 0.010212956462055445, "learning_rate": 2.334e-06, "loss": 0.0005, "num_tokens": 892052.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 55.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.948071241378784, "kl": 0.007339737727306783, "learning_rate": 2.333666666666667e-06, "loss": -0.0483, "num_tokens": 892343.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 55.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.006259546149522066, "kl": 0.0010361314052715898, "learning_rate": 2.3333333333333336e-06, "loss": 0.0001, "num_tokens": 892603.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 55.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.004210996441543102, "kl": 0.016116227954626083, "learning_rate": 2.333e-06, "loss": 0.0008, "num_tokens": 892863.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 55.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.001128842937760055, "kl": 0.0012331880861893296, "learning_rate": 2.3326666666666667e-06, "loss": 0.0001, "num_tokens": 893143.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 55.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.006712694652378559, "kl": 0.0003460347652435303, "learning_rate": 2.3323333333333335e-06, "loss": 0.0, "num_tokens": 893363.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 55.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.10337196290493011, "kl": 0.009352536872029305, "learning_rate": 2.332e-06, "loss": 0.0005, "num_tokens": 893633.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 55.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.21204231679439545, "kl": 0.014737354591488838, "learning_rate": 2.3316666666666666e-06, "loss": 0.0007, "num_tokens": 893899.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3006 }, { "clip_ratio/high_max": 0.008064515888690948, "clip_ratio/high_mean": 0.008064515888690948, "clip_ratio/low_mean": 0.007936508394777775, "clip_ratio/low_min": 0.007936508394777775, "clip_ratio/region_mean": 0.016001024283468723, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 55.68518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 2.7153398990631104, "kl": 0.07125603780150414, "learning_rate": 2.3313333333333334e-06, "loss": 0.032, "num_tokens": 894252.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 3007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 55.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10559558868408203, "kl": 0.002482399344444275, "learning_rate": 2.3310000000000002e-06, "loss": 0.0001, "num_tokens": 894464.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009259259328246117, "clip_ratio/low_min": 0.009259259328246117, "clip_ratio/region_mean": 0.009259259328246117, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 55.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.876154899597168, "kl": 0.5876728873699903, "learning_rate": 2.3306666666666666e-06, "loss": 0.0493, "num_tokens": 894780.0, "reward": 2.375, "reward_std": 1.8874585628509521, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.8874585628509521, "step": 3009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 55.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.16329513490200043, "kl": 0.020946836099028587, "learning_rate": 2.3303333333333334e-06, "loss": 0.001, "num_tokens": 895062.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 55.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07556506991386414, "kl": 0.02021302655339241, "learning_rate": 2.3299999999999997e-06, "loss": 0.001, "num_tokens": 895394.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 55.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.021483929827809334, "kl": 0.0024198753526434302, "learning_rate": 2.329666666666667e-06, "loss": 0.0001, "num_tokens": 895726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 55.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 13.228107452392578, "kl": 0.031029794365167618, "learning_rate": 2.3293333333333337e-06, "loss": 0.1464, "num_tokens": 895965.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 55.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.369430273771286, "kl": 0.02256050705909729, "learning_rate": 2.329e-06, "loss": 0.0011, "num_tokens": 896181.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 55.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.05093751475214958, "kl": 0.09115274250507355, "learning_rate": 2.328666666666667e-06, "loss": 0.0046, "num_tokens": 896547.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012195121496915817, "clip_ratio/low_min": 0.012195121496915817, "clip_ratio/region_mean": 0.012195121496915817, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 55.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 4.179732322692871, "kl": 0.002333106007426977, "learning_rate": 2.328333333333333e-06, "loss": 0.0557, "num_tokens": 896864.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 55.870370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 3.867720365524292, "kl": 0.05079939030110836, "learning_rate": 2.328e-06, "loss": -0.0267, "num_tokens": 897169.0, "reward": 4.5, "reward_std": 2.345207929611206, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.345207929611206, "step": 3017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 55.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.021309273317456245, "kl": 0.26525846123695374, "learning_rate": 2.3276666666666668e-06, "loss": 0.0133, "num_tokens": 897473.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 55.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.016170399263501167, "kl": 0.00035906137782149017, "learning_rate": 2.3273333333333336e-06, "loss": 0.0, "num_tokens": 897729.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 55.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0910065621137619, "kl": 0.007152083155233413, "learning_rate": 2.327e-06, "loss": 0.0003, "num_tokens": 898008.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 55.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.20022736489772797, "kl": 0.011388115584850311, "learning_rate": 2.3266666666666667e-06, "loss": 0.0006, "num_tokens": 898220.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 55.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.031021635979413986, "kl": 0.0023814737796783447, "learning_rate": 2.3263333333333335e-06, "loss": 0.0001, "num_tokens": 898432.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 55.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01577245071530342, "kl": 0.0012783058336935937, "learning_rate": 2.326e-06, "loss": 0.0001, "num_tokens": 898706.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 56.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.15930642187595367, "kl": 0.03882480412721634, "learning_rate": 2.3256666666666666e-06, "loss": 0.0019, "num_tokens": 898981.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 56.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.002751645166426897, "kl": 0.0033688247203826904, "learning_rate": 2.3253333333333334e-06, "loss": 0.0002, "num_tokens": 899217.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 56.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.015981562435626984, "kl": 0.0035631279461085796, "learning_rate": 2.325e-06, "loss": 0.0002, "num_tokens": 899509.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.12137234210968018, "kl": 0.005759598687291145, "learning_rate": 2.3246666666666665e-06, "loss": 0.0003, "num_tokens": 899771.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 56.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036598711740225554, "kl": 0.016190843656659126, "learning_rate": 2.3243333333333333e-06, "loss": 0.0008, "num_tokens": 900031.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 56.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.06579843908548355, "kl": 0.0029603760922327638, "learning_rate": 2.324e-06, "loss": 0.0001, "num_tokens": 900264.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 56.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.042218174785375595, "kl": 0.024056091904640198, "learning_rate": 2.323666666666667e-06, "loss": 0.0013, "num_tokens": 900553.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 56.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.022274121642112732, "kl": 0.2651331424713135, "learning_rate": 2.3233333333333337e-06, "loss": 0.0133, "num_tokens": 900857.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 56.148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 4.141805171966553, "kl": 0.07211090251803398, "learning_rate": 2.323e-06, "loss": 0.0458, "num_tokens": 901153.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 56.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 3.002310037612915, "kl": 0.10307259391993284, "learning_rate": 2.322666666666667e-06, "loss": -0.0001, "num_tokens": 901441.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 6.237639904022217, "kl": 0.0351610891520977, "learning_rate": 2.322333333333333e-06, "loss": 0.0439, "num_tokens": 901715.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 56.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.2782034873962402, "kl": 0.025777772068977356, "learning_rate": 2.322e-06, "loss": 0.0102, "num_tokens": 902046.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 56.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.07503416389226913, "kl": 0.004768489394336939, "learning_rate": 2.3216666666666667e-06, "loss": 0.0002, "num_tokens": 902364.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 56.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05003504827618599, "kl": 0.010098483297042549, "learning_rate": 2.3213333333333335e-06, "loss": 0.0005, "num_tokens": 902691.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 56.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.3545469641685486, "kl": 0.061767819337546825, "learning_rate": 2.321e-06, "loss": 0.0032, "num_tokens": 903047.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 56.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.020279807969927788, "kl": 0.0008650238742120564, "learning_rate": 2.3206666666666667e-06, "loss": 0.0, "num_tokens": 903317.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.026220832020044327, "kl": 0.006930738687515259, "learning_rate": 2.3203333333333335e-06, "loss": 0.0003, "num_tokens": 903585.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 56.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.04772263392806053, "kl": 0.005179248750209808, "learning_rate": 2.32e-06, "loss": 0.0003, "num_tokens": 903795.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 56.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.3156445324420929, "kl": 0.09944610297679901, "learning_rate": 2.319666666666667e-06, "loss": 0.0043, "num_tokens": 904153.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 56.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.012312165461480618, "kl": 0.0016637109220027924, "learning_rate": 2.3193333333333334e-06, "loss": 0.0001, "num_tokens": 904465.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 56.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 4.618168354034424, "kl": 0.3945522963767871, "learning_rate": 2.319e-06, "loss": 0.0158, "num_tokens": 904725.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 56.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.10125128924846649, "kl": 0.019602932035923004, "learning_rate": 2.3186666666666665e-06, "loss": 0.001, "num_tokens": 905027.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 56.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.26650863885879517, "kl": 0.05305817723274231, "learning_rate": 2.3183333333333333e-06, "loss": 0.0027, "num_tokens": 905323.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 56.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.028388069942593575, "kl": 0.0015472486848011613, "learning_rate": 2.318e-06, "loss": 0.0001, "num_tokens": 905577.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 56.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.050574012100696564, "kl": 0.010240362957119942, "learning_rate": 2.317666666666667e-06, "loss": 0.0005, "num_tokens": 905883.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 56.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1809522956609726, "kl": 0.01540428027510643, "learning_rate": 2.3173333333333336e-06, "loss": 0.0008, "num_tokens": 906143.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 56.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.3351411819458008, "kl": 0.04343859851360321, "learning_rate": 2.317e-06, "loss": 0.0027, "num_tokens": 906414.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 56.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.26799216866493225, "kl": 0.04025677964091301, "learning_rate": 2.316666666666667e-06, "loss": 0.002, "num_tokens": 906732.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 56.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.867607593536377, "kl": 0.1391044482588768, "learning_rate": 2.316333333333333e-06, "loss": -0.0006, "num_tokens": 907135.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 56.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.280645847320557, "kl": 0.009545482462272048, "learning_rate": 2.316e-06, "loss": 0.0354, "num_tokens": 907462.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 56.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.00037280612741596997, "kl": 4.4927000999450684e-05, "learning_rate": 2.3156666666666667e-06, "loss": 0.0, "num_tokens": 907682.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 56.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.006934404838830233, "kl": 0.0004408538370626047, "learning_rate": 2.3153333333333335e-06, "loss": 0.0, "num_tokens": 907902.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 56.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0731884092092514, "kl": 0.005249355337582529, "learning_rate": 2.315e-06, "loss": 0.0003, "num_tokens": 908172.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 56.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.004869671072810888, "kl": 0.0001500844955444336, "learning_rate": 2.3146666666666666e-06, "loss": 0.0, "num_tokens": 908384.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 56.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.29635345935821533, "kl": 0.0346414668019861, "learning_rate": 2.3143333333333334e-06, "loss": 0.0017, "num_tokens": 908668.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 56.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 3.1607277393341064, "kl": 0.0721521582454443, "learning_rate": 2.314e-06, "loss": -0.0737, "num_tokens": 909023.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 3059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 56.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.07834595441818237, "kl": 0.005899720126762986, "learning_rate": 2.313666666666667e-06, "loss": 0.0003, "num_tokens": 909307.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 56.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.37481996417045593, "kl": 0.04803141113370657, "learning_rate": 2.3133333333333333e-06, "loss": 0.0029, "num_tokens": 909589.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 56.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06571433693170547, "kl": 0.003023725701496005, "learning_rate": 2.313e-06, "loss": 0.0002, "num_tokens": 909845.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 56.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.027004772797226906, "kl": 0.0009928946965374053, "learning_rate": 2.3126666666666665e-06, "loss": 0.0, "num_tokens": 910088.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 56.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.12884853780269623, "kl": 0.0746908187866211, "learning_rate": 2.3123333333333333e-06, "loss": 0.0037, "num_tokens": 910452.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.024170802906155586, "kl": 0.0031657739309594035, "learning_rate": 2.312e-06, "loss": 0.0002, "num_tokens": 910750.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 56.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.06535699218511581, "kl": 0.00893158046528697, "learning_rate": 2.311666666666667e-06, "loss": 0.0004, "num_tokens": 911039.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 56.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004719328135251999, "kl": 0.0029603242874145508, "learning_rate": 2.3113333333333336e-06, "loss": 0.0001, "num_tokens": 911255.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 56.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 2.5798566341400146, "kl": 0.07439298555254936, "learning_rate": 2.311e-06, "loss": 0.0018, "num_tokens": 911611.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 56.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.06750567257404327, "kl": 0.05538497120141983, "learning_rate": 2.3106666666666668e-06, "loss": 0.0028, "num_tokens": 911944.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.016536394134163857, "kl": 0.001628221827559173, "learning_rate": 2.310333333333333e-06, "loss": 0.0001, "num_tokens": 912221.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 56.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.05181695520877838, "kl": 0.0023673070245422423, "learning_rate": 2.31e-06, "loss": 0.0001, "num_tokens": 912532.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 56.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.08608939498662949, "kl": 0.15065090358257294, "learning_rate": 2.3096666666666667e-06, "loss": 0.0075, "num_tokens": 912849.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 56.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.07637684792280197, "kl": 0.015827403403818607, "learning_rate": 2.3093333333333335e-06, "loss": 0.0008, "num_tokens": 913177.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 56.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.09212353825569153, "kl": 0.0043697357177734375, "learning_rate": 2.3090000000000003e-06, "loss": 0.0002, "num_tokens": 913389.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.018798593431711197, "kl": 0.0020437620114535093, "learning_rate": 2.3086666666666666e-06, "loss": 0.0001, "num_tokens": 913657.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 56.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.522083282470703, "kl": 0.027398478239774704, "learning_rate": 2.3083333333333334e-06, "loss": 0.3735, "num_tokens": 914000.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 56.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.773656368255615, "kl": 0.14111773297190666, "learning_rate": 2.308e-06, "loss": -0.0063, "num_tokens": 914315.0, "reward": 2.75, "reward_std": 3.9686269760131836, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.9686269760131836, "step": 3077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 57.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09969069063663483, "kl": 0.008085519599262625, "learning_rate": 2.307666666666667e-06, "loss": 0.0004, "num_tokens": 914660.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 57.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 1.306377649307251, "kl": 0.12525998149067163, "learning_rate": 2.3073333333333333e-06, "loss": 0.0067, "num_tokens": 914934.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 57.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.13874602317810059, "kl": 0.03274434059858322, "learning_rate": 2.307e-06, "loss": 0.0016, "num_tokens": 915228.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 57.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06902331858873367, "kl": 0.010074528399854898, "learning_rate": 2.3066666666666665e-06, "loss": 0.0005, "num_tokens": 915558.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 57.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 6.246304035186768, "kl": 0.00846839026780799, "learning_rate": 2.3063333333333332e-06, "loss": 0.0727, "num_tokens": 915897.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 57.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.16027773916721344, "kl": 0.0288890665397048, "learning_rate": 2.306e-06, "loss": 0.0014, "num_tokens": 916169.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 57.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003709029115270823, "kl": 4.108995199203491e-05, "learning_rate": 2.305666666666667e-06, "loss": 0.0, "num_tokens": 916389.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 57.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 3.7916741371154785, "kl": 0.0329713923856616, "learning_rate": 2.3053333333333336e-06, "loss": 0.1511, "num_tokens": 916725.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 57.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007831676630303264, "kl": 0.0012073814868927002, "learning_rate": 2.305e-06, "loss": 0.0001, "num_tokens": 917005.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 57.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.03505430370569229, "kl": 0.005548796383664012, "learning_rate": 2.3046666666666667e-06, "loss": 0.0003, "num_tokens": 917276.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 57.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.05358888581395149, "kl": 0.0027505953039508313, "learning_rate": 2.304333333333333e-06, "loss": 0.0001, "num_tokens": 917511.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 57.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005964314099401236, "kl": 0.002862304449081421, "learning_rate": 2.3040000000000003e-06, "loss": 0.0001, "num_tokens": 917727.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 57.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0240944791585207, "kl": 0.00037530362169491127, "learning_rate": 2.3036666666666667e-06, "loss": 0.0, "num_tokens": 917983.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 57.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.030668575316667557, "kl": 0.2634844481945038, "learning_rate": 2.3033333333333334e-06, "loss": 0.0132, "num_tokens": 918287.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 57.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03100777044892311, "kl": 0.001269012689590454, "learning_rate": 2.3030000000000002e-06, "loss": 0.0001, "num_tokens": 918557.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 57.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.06761438399553299, "kl": 0.010994312353432178, "learning_rate": 2.3026666666666666e-06, "loss": 0.0005, "num_tokens": 918861.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 57.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031846552155911922, "kl": 6.537884473800659e-05, "learning_rate": 2.3023333333333334e-06, "loss": 0.0, "num_tokens": 919073.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 57.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.11781570315361023, "kl": 0.012454411946237087, "learning_rate": 2.302e-06, "loss": 0.0007, "num_tokens": 919339.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 90.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 90.25, "completions/mean_terminated_length": 35.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 57.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 3.5489165782928467, "kl": 0.05572972074151039, "learning_rate": 2.301666666666667e-06, "loss": 0.3391, "num_tokens": 919916.0, "reward": 1.625, "reward_std": 2.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 2.25, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 57.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.025521112605929375, "kl": 0.10341102629899979, "learning_rate": 2.3013333333333333e-06, "loss": 0.0053, "num_tokens": 920284.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 57.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.10133406519889832, "kl": 0.007326021790504456, "learning_rate": 2.301e-06, "loss": 0.0004, "num_tokens": 920528.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 57.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.03059845231473446, "kl": 0.0007258206605911255, "learning_rate": 2.3006666666666664e-06, "loss": 0.0, "num_tokens": 920740.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 57.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.060716625303030014, "kl": 0.0074819540604949, "learning_rate": 2.3003333333333332e-06, "loss": 0.0004, "num_tokens": 921028.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 57.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.10100864619016647, "kl": 0.02005849126726389, "learning_rate": 2.3000000000000004e-06, "loss": 0.001, "num_tokens": 921312.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 57.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.10387787967920303, "kl": 0.0436395313590765, "learning_rate": 2.2996666666666668e-06, "loss": 0.0022, "num_tokens": 921720.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 57.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.024397222325205803, "kl": 0.0027708099223673344, "learning_rate": 2.2993333333333336e-06, "loss": 0.0001, "num_tokens": 922016.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 57.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0444328710436821, "kl": 0.004600699990987778, "learning_rate": 2.299e-06, "loss": 0.0002, "num_tokens": 922328.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 57.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.1559734046459198, "kl": 0.012478619813919067, "learning_rate": 2.2986666666666667e-06, "loss": 0.0006, "num_tokens": 922588.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 57.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04167880862951279, "kl": 0.009483292698860168, "learning_rate": 2.2983333333333335e-06, "loss": 0.0005, "num_tokens": 922879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 57.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.077730417251587, "kl": 0.05038454011082649, "learning_rate": 2.2980000000000003e-06, "loss": 0.0486, "num_tokens": 923218.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 3107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 57.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.011280560865998268, "kl": 0.000858005863847211, "learning_rate": 2.2976666666666666e-06, "loss": 0.0, "num_tokens": 923484.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 57.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.028662335127592087, "kl": 0.005231330171227455, "learning_rate": 2.2973333333333334e-06, "loss": 0.0003, "num_tokens": 923788.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 57.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.2523195445537567, "kl": 0.017590429866686463, "learning_rate": 2.297e-06, "loss": 0.0008, "num_tokens": 924056.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 57.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.027332987636327744, "kl": 0.0031800430733710527, "learning_rate": 2.2966666666666666e-06, "loss": 0.0002, "num_tokens": 924338.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 57.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.12399964779615402, "kl": 0.03697334788739681, "learning_rate": 2.2963333333333333e-06, "loss": 0.0019, "num_tokens": 924708.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 57.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.08509934693574905, "kl": 0.018704267218708992, "learning_rate": 2.296e-06, "loss": 0.0009, "num_tokens": 925118.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 57.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.004579869564622641, "kl": 0.0002828136057360098, "learning_rate": 2.295666666666667e-06, "loss": 0.0, "num_tokens": 925432.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 57.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.003519898047670722, "kl": 0.01617850735783577, "learning_rate": 2.2953333333333333e-06, "loss": 0.0008, "num_tokens": 925692.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 57.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.026258887723088264, "kl": 0.0015709161525592208, "learning_rate": 2.295e-06, "loss": 0.0001, "num_tokens": 925952.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3116 }, { "clip_ratio/high_max": 0.011627906933426857, "clip_ratio/high_mean": 0.011627906933426857, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011627906933426857, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 57.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.801479816436768, "kl": 0.02480255998671055, "learning_rate": 2.2946666666666664e-06, "loss": -0.0741, "num_tokens": 926264.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 57.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04094352573156357, "kl": 0.01576678641140461, "learning_rate": 2.294333333333333e-06, "loss": 0.0008, "num_tokens": 926570.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 57.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0319712795317173, "kl": 0.0007952234009280801, "learning_rate": 2.2940000000000004e-06, "loss": 0.0, "num_tokens": 926854.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 57.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.05532447248697281, "kl": 0.007991239661350846, "learning_rate": 2.2936666666666668e-06, "loss": 0.0004, "num_tokens": 927176.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 57.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.8568344116210938, "kl": 0.14629393070936203, "learning_rate": 2.2933333333333335e-06, "loss": -0.0124, "num_tokens": 927548.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 57.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.04195952042937279, "kl": 0.0013035978190600872, "learning_rate": 2.293e-06, "loss": 0.0001, "num_tokens": 927770.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 57.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.07107017189264297, "kl": 0.006461275741457939, "learning_rate": 2.2926666666666667e-06, "loss": 0.0003, "num_tokens": 928042.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 57.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.025414487347006798, "kl": 0.004055418074131012, "learning_rate": 2.2923333333333335e-06, "loss": 0.0002, "num_tokens": 928252.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 57.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.060030724853277206, "kl": 0.004811800085008144, "learning_rate": 2.2920000000000002e-06, "loss": 0.0002, "num_tokens": 928571.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 57.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.39573654532432556, "kl": 0.023475729685742408, "learning_rate": 2.2916666666666666e-06, "loss": 0.0014, "num_tokens": 928836.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 57.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.92252779006958, "kl": 0.037057699635624886, "learning_rate": 2.2913333333333334e-06, "loss": -0.0153, "num_tokens": 929207.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 57.925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 4.137658596038818, "kl": 0.04328920692205429, "learning_rate": 2.291e-06, "loss": -0.1235, "num_tokens": 929529.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 57.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.050064437091350555, "kl": 0.0074186623096466064, "learning_rate": 2.2906666666666665e-06, "loss": 0.0004, "num_tokens": 929797.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 57.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.9779375791549683, "kl": 0.014265389880165458, "learning_rate": 2.2903333333333333e-06, "loss": -0.0853, "num_tokens": 930140.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 57.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06911390274763107, "kl": 0.15921716392040253, "learning_rate": 2.29e-06, "loss": 0.008, "num_tokens": 930450.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021713341120630503, "kl": 0.0034520328044891357, "learning_rate": 2.289666666666667e-06, "loss": 0.0002, "num_tokens": 930686.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 58.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.2619903087615967, "kl": 0.13335754722356796, "learning_rate": 2.2893333333333332e-06, "loss": -0.0579, "num_tokens": 930983.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 58.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.18223130702972412, "kl": 0.03682851418852806, "learning_rate": 2.289e-06, "loss": 0.0019, "num_tokens": 931297.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 58.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.16124214231967926, "kl": 0.02562273107469082, "learning_rate": 2.2886666666666664e-06, "loss": 0.0014, "num_tokens": 931591.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 58.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.048398155719041824, "kl": 0.0190586696844548, "learning_rate": 2.2883333333333336e-06, "loss": 0.001, "num_tokens": 931869.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 58.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.03625788167119026, "kl": 0.0075961456168442965, "learning_rate": 2.2880000000000004e-06, "loss": 0.0004, "num_tokens": 932156.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 58.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.03786690533161163, "kl": 0.10377553105354309, "learning_rate": 2.2876666666666667e-06, "loss": 0.0053, "num_tokens": 932524.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 58.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.03660387173295021, "kl": 0.011968305916525424, "learning_rate": 2.2873333333333335e-06, "loss": 0.0006, "num_tokens": 932810.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 58.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0543820858001709, "kl": 0.002412225818261504, "learning_rate": 2.287e-06, "loss": 0.0001, "num_tokens": 933130.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 58.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.023981111124157906, "kl": 0.0027115034172311425, "learning_rate": 2.2866666666666667e-06, "loss": 0.0001, "num_tokens": 933426.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.03546467795968056, "kl": 0.00028955191373825073, "learning_rate": 2.2863333333333334e-06, "loss": 0.0, "num_tokens": 933638.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 58.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.003263151040300727, "kl": 0.00019640723621705547, "learning_rate": 2.2860000000000002e-06, "loss": 0.0, "num_tokens": 933950.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375699609518051, "kl": 0.010075107216835022, "learning_rate": 2.2856666666666666e-06, "loss": 0.0005, "num_tokens": 934166.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 58.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10350469499826431, "kl": 0.03227981645613909, "learning_rate": 2.2853333333333334e-06, "loss": 0.0016, "num_tokens": 934442.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 58.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.021121809259057045, "kl": 0.007212073542177677, "learning_rate": 2.285e-06, "loss": 0.0004, "num_tokens": 934748.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 58.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.05517319589853287, "kl": 0.011987493140622973, "learning_rate": 2.2846666666666665e-06, "loss": 0.0006, "num_tokens": 935083.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 58.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.12442062050104141, "kl": 0.018907058984041214, "learning_rate": 2.2843333333333333e-06, "loss": 0.001, "num_tokens": 935371.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 58.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.6243205070495605, "kl": 0.04667986184358597, "learning_rate": 2.284e-06, "loss": 0.0016, "num_tokens": 935711.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 3149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 58.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.15774016082286835, "kl": 0.16984020173549652, "learning_rate": 2.283666666666667e-06, "loss": 0.0085, "num_tokens": 936021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 58.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.032631825655698776, "kl": 0.0012128648231737316, "learning_rate": 2.2833333333333332e-06, "loss": 0.0001, "num_tokens": 936270.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 58.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 4.017707347869873, "kl": 0.07463878626003861, "learning_rate": 2.283e-06, "loss": -0.1594, "num_tokens": 936542.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 58.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 4.816429615020752, "kl": 0.030274469638243318, "learning_rate": 2.2826666666666668e-06, "loss": 0.0935, "num_tokens": 936825.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 3153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 58.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.10214201360940933, "kl": 0.0058991871774196625, "learning_rate": 2.2823333333333336e-06, "loss": 0.0003, "num_tokens": 937085.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 58.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.006219821982085705, "kl": 0.0007234037038870156, "learning_rate": 2.2820000000000003e-06, "loss": 0.0, "num_tokens": 937345.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03278283029794693, "kl": 0.003994007944129407, "learning_rate": 2.2816666666666667e-06, "loss": 0.0002, "num_tokens": 937559.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 58.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.007297915406525135, "kl": 0.00059682727442123, "learning_rate": 2.2813333333333335e-06, "loss": 0.0, "num_tokens": 937779.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 58.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.10067549347877502, "kl": 0.008874677121639252, "learning_rate": 2.281e-06, "loss": 0.0005, "num_tokens": 938109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 58.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.5791587829589844, "kl": 0.060645993798971176, "learning_rate": 2.2806666666666666e-06, "loss": 0.1261, "num_tokens": 938482.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 3159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 58.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.38537588715553284, "kl": 0.04332096315920353, "learning_rate": 2.2803333333333334e-06, "loss": 0.0021, "num_tokens": 938813.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 85.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 85.75, "completions/mean_terminated_length": 29.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 58.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.909480094909668, "kl": 0.015507086180150509, "learning_rate": 2.28e-06, "loss": 0.228, "num_tokens": 939408.0, "reward": 3.924999952316284, "reward_std": 4.660739898681641, "rewards/reward_combined/mean": 3.924999952316284, "rewards/reward_combined/std": 4.660740375518799, "step": 3161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 58.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04581165686249733, "kl": 0.013135399203747511, "learning_rate": 2.2796666666666666e-06, "loss": 0.0007, "num_tokens": 939714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 58.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.06691695749759674, "kl": 0.010791434440761805, "learning_rate": 2.2793333333333333e-06, "loss": 0.0006, "num_tokens": 940040.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 58.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 3.299450397491455, "kl": 0.23470115661621094, "learning_rate": 2.279e-06, "loss": -0.0392, "num_tokens": 940374.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 3164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 58.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.002639294834807515, "kl": 0.0015660664066672325, "learning_rate": 2.2786666666666665e-06, "loss": 0.0001, "num_tokens": 940686.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 58.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0319683738052845, "kl": 0.0013311710208654404, "learning_rate": 2.2783333333333337e-06, "loss": 0.0001, "num_tokens": 940953.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006410256493836641, "clip_ratio/low_min": 0.006410256493836641, "clip_ratio/region_mean": 0.006410256493836641, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 58.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 2.21768856048584, "kl": 0.06081162393093109, "learning_rate": 2.278e-06, "loss": 0.0018, "num_tokens": 941322.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 3167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 58.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.00885734986513853, "kl": 0.000110626220703125, "learning_rate": 2.277666666666667e-06, "loss": 0.0, "num_tokens": 941534.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 58.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.05073962360620499, "kl": 0.0059238689718768, "learning_rate": 2.277333333333333e-06, "loss": 0.0003, "num_tokens": 941823.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 58.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.2143610715866089, "kl": 0.07582800090312958, "learning_rate": 2.277e-06, "loss": 0.0038, "num_tokens": 942220.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 58.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.7463741302490234, "kl": 0.038494925014674664, "learning_rate": 2.2766666666666668e-06, "loss": 0.1325, "num_tokens": 942489.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 58.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04515834525227547, "kl": 0.0058713669423013926, "learning_rate": 2.2763333333333335e-06, "loss": 0.0003, "num_tokens": 942771.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 58.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.432664155960083, "kl": 0.03460339084267616, "learning_rate": 2.2760000000000003e-06, "loss": -0.0464, "num_tokens": 943179.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.002696047071367502, "kl": 0.0033345669507980347, "learning_rate": 2.2756666666666667e-06, "loss": 0.0002, "num_tokens": 943415.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 58.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0314471498131752, "kl": 0.26328007876873016, "learning_rate": 2.2753333333333335e-06, "loss": 0.0132, "num_tokens": 943719.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 58.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.275594711303711, "kl": 0.15882434137165546, "learning_rate": 2.275e-06, "loss": 0.0333, "num_tokens": 944035.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 3176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 58.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.00265194708481431, "kl": 0.01631934382021427, "learning_rate": 2.2746666666666666e-06, "loss": 0.0008, "num_tokens": 944295.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 58.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.01292788702994585, "kl": 0.00037716925726272166, "learning_rate": 2.2743333333333334e-06, "loss": 0.0, "num_tokens": 944551.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 58.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009411151404492557, "kl": 0.0012224827660247684, "learning_rate": 2.274e-06, "loss": 0.0001, "num_tokens": 944831.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003616356698330492, "kl": 3.196299076080322e-05, "learning_rate": 2.2736666666666665e-06, "loss": 0.0, "num_tokens": 945051.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 58.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.010120888240635395, "kl": 0.000500300811836496, "learning_rate": 2.2733333333333333e-06, "loss": 0.0, "num_tokens": 945286.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 58.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.015457162633538246, "kl": 0.0005570801731664687, "learning_rate": 2.273e-06, "loss": 0.0, "num_tokens": 945548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 58.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07276313751935959, "kl": 0.01879236288368702, "learning_rate": 2.272666666666667e-06, "loss": 0.001, "num_tokens": 945876.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 58.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 6.651898384094238, "kl": 0.020251495763659477, "learning_rate": 2.2723333333333337e-06, "loss": 0.1915, "num_tokens": 946146.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 58.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03362635523080826, "kl": 0.0059560188092291355, "learning_rate": 2.272e-06, "loss": 0.0003, "num_tokens": 946419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09272552281618118, "kl": 0.012921370565891266, "learning_rate": 2.271666666666667e-06, "loss": 0.0006, "num_tokens": 946689.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 59.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1340102106332779, "kl": 0.027555877342820168, "learning_rate": 2.271333333333333e-06, "loss": 0.0014, "num_tokens": 946964.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 59.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07986051589250565, "kl": 0.005420433357357979, "learning_rate": 2.271e-06, "loss": 0.0003, "num_tokens": 947262.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 59.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.027396192774176598, "kl": 0.00856949482113123, "learning_rate": 2.2706666666666667e-06, "loss": 0.0004, "num_tokens": 947581.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 59.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.002769162179902196, "kl": 0.01630319282412529, "learning_rate": 2.2703333333333335e-06, "loss": 0.0008, "num_tokens": 947841.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 59.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 7.268100738525391, "kl": 0.008197366842068732, "learning_rate": 2.2700000000000003e-06, "loss": 0.0559, "num_tokens": 948170.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 59.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 1.8832625150680542, "kl": 0.30652645975351334, "learning_rate": 2.2696666666666666e-06, "loss": -0.0451, "num_tokens": 948535.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 59.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.012851247563958168, "kl": 0.004570577992126346, "learning_rate": 2.2693333333333334e-06, "loss": 0.0002, "num_tokens": 948825.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 59.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.040948763489723206, "kl": 0.10290932282805443, "learning_rate": 2.269e-06, "loss": 0.0051, "num_tokens": 949193.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.006165982689708471, "kl": 0.0004742443561553955, "learning_rate": 2.2686666666666666e-06, "loss": 0.0, "num_tokens": 949453.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 59.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.016570178791880608, "kl": 0.0006027974013704807, "learning_rate": 2.2683333333333334e-06, "loss": 0.0, "num_tokens": 949687.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 59.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.13422216475009918, "kl": 0.010207831393927336, "learning_rate": 2.268e-06, "loss": 0.0005, "num_tokens": 950013.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 59.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.17172186076641083, "kl": 0.03238068986684084, "learning_rate": 2.267666666666667e-06, "loss": 0.0017, "num_tokens": 950302.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 59.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.040401242673397064, "kl": 0.0015272833406925201, "learning_rate": 2.2673333333333333e-06, "loss": 0.0001, "num_tokens": 950562.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 59.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.007999102585017681, "kl": 0.002675756812095642, "learning_rate": 2.267e-06, "loss": 0.0001, "num_tokens": 950778.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 59.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.01074859406799078, "kl": 0.0001471191644668579, "learning_rate": 2.266666666666667e-06, "loss": 0.0, "num_tokens": 950990.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 59.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.44013193249702454, "kl": 0.04921358870342374, "learning_rate": 2.2663333333333336e-06, "loss": 0.0026, "num_tokens": 951280.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 59.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.07076862454414368, "kl": 0.034902628511190414, "learning_rate": 2.266e-06, "loss": 0.0017, "num_tokens": 951584.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 59.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0355362631380558, "kl": 0.0015573574928566813, "learning_rate": 2.2656666666666668e-06, "loss": 0.0001, "num_tokens": 951808.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 59.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.03428663685917854, "kl": 0.15743301808834076, "learning_rate": 2.265333333333333e-06, "loss": 0.0079, "num_tokens": 952117.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 59.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.033244941383600235, "kl": 0.00034081190824508667, "learning_rate": 2.265e-06, "loss": 0.0, "num_tokens": 952329.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 59.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.02193329483270645, "kl": 0.0031790193170309067, "learning_rate": 2.2646666666666667e-06, "loss": 0.0002, "num_tokens": 952641.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 59.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003524729108903557, "kl": 2.9958784580230713e-05, "learning_rate": 2.2643333333333335e-06, "loss": 0.0, "num_tokens": 952861.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 59.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 2.1891849040985107, "kl": 0.04719813913106918, "learning_rate": 2.2640000000000003e-06, "loss": 0.0263, "num_tokens": 953190.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 59.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04561041295528412, "kl": 0.006037738639861345, "learning_rate": 2.2636666666666666e-06, "loss": 0.0003, "num_tokens": 953472.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 59.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06946126371622086, "kl": 0.02700081653892994, "learning_rate": 2.2633333333333334e-06, "loss": 0.0014, "num_tokens": 953831.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 59.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.053101733326911926, "kl": 0.002790637663565576, "learning_rate": 2.2629999999999998e-06, "loss": 0.0001, "num_tokens": 954100.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 59.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.026106122881174088, "kl": 0.051188673824071884, "learning_rate": 2.262666666666667e-06, "loss": 0.0026, "num_tokens": 954433.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 59.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.038653019815683365, "kl": 0.2619563937187195, "learning_rate": 2.2623333333333333e-06, "loss": 0.0131, "num_tokens": 954737.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 59.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.027862071990967, "kl": 0.07980869337916374, "learning_rate": 2.262e-06, "loss": -0.0413, "num_tokens": 955059.0, "reward": 3.875, "reward_std": 2.688710927963257, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 2.688710927963257, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 59.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.058128684759140015, "kl": 0.01026546536013484, "learning_rate": 2.261666666666667e-06, "loss": 0.0005, "num_tokens": 955383.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051583899185061455, "kl": 0.00016373396283597685, "learning_rate": 2.2613333333333333e-06, "loss": 0.0, "num_tokens": 955639.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 59.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.08959463238716125, "kl": 0.023373776115477085, "learning_rate": 2.261e-06, "loss": 0.0012, "num_tokens": 955979.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 59.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 4.224197864532471, "kl": 0.022706760093569756, "learning_rate": 2.260666666666667e-06, "loss": 0.0558, "num_tokens": 956288.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 59.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 2.607232093811035, "kl": 0.19994370639324188, "learning_rate": 2.2603333333333336e-06, "loss": -0.0326, "num_tokens": 956687.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 59.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.01402705255895853, "kl": 0.00022174417972564697, "learning_rate": 2.26e-06, "loss": 0.0, "num_tokens": 956891.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 59.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009804409928619862, "kl": 0.0011819813516922295, "learning_rate": 2.2596666666666667e-06, "loss": 0.0001, "num_tokens": 957171.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 59.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.09625894576311111, "kl": 0.014301734045147896, "learning_rate": 2.259333333333333e-06, "loss": 0.0008, "num_tokens": 957455.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 59.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.031403396278619766, "kl": 0.0014009957667440176, "learning_rate": 2.259e-06, "loss": 0.0001, "num_tokens": 957760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.1456184685230255, "kl": 0.011133139487355947, "learning_rate": 2.2586666666666667e-06, "loss": 0.0005, "num_tokens": 958034.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 59.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.14911921322345734, "kl": 0.06302820518612862, "learning_rate": 2.2583333333333335e-06, "loss": 0.0032, "num_tokens": 958379.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3226 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.009615384973585606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 59.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.399496555328369, "kl": 0.08771447464823723, "learning_rate": 2.2580000000000002e-06, "loss": 0.0873, "num_tokens": 958701.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 59.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.04860496520996094, "kl": 0.008628547424450517, "learning_rate": 2.2576666666666666e-06, "loss": 0.0004, "num_tokens": 959007.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.529590368270874, "kl": 0.09773006429895759, "learning_rate": 2.2573333333333334e-06, "loss": 0.0008, "num_tokens": 959267.0, "reward": 2.0, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 1.7320507764816284, "step": 3229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 59.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.03229386731982231, "kl": 0.007061955519020557, "learning_rate": 2.257e-06, "loss": 0.0004, "num_tokens": 959572.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 59.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0624580979347229, "kl": 0.012530050007626414, "learning_rate": 2.256666666666667e-06, "loss": 0.0007, "num_tokens": 959858.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 59.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0475514717400074, "kl": 0.0035444003297016025, "learning_rate": 2.2563333333333333e-06, "loss": 0.0002, "num_tokens": 960124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 59.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.033293817192316055, "kl": 0.007924523204565048, "learning_rate": 2.256e-06, "loss": 0.0004, "num_tokens": 960398.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 59.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.033901579678058624, "kl": 0.0005332790315151215, "learning_rate": 2.255666666666667e-06, "loss": 0.0, "num_tokens": 960642.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 59.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.002517654560506344, "kl": 0.0033646076917648315, "learning_rate": 2.2553333333333332e-06, "loss": 0.0002, "num_tokens": 960878.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.21926459670066833, "kl": 0.015592428855597973, "learning_rate": 2.255e-06, "loss": 0.0008, "num_tokens": 961146.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 59.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.7518670558929443, "kl": 0.022783292457461357, "learning_rate": 2.254666666666667e-06, "loss": 0.141, "num_tokens": 961431.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 59.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02968648634850979, "kl": 0.004818763351067901, "learning_rate": 2.2543333333333336e-06, "loss": 0.0002, "num_tokens": 961706.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.026802120730280876, "kl": 0.0010022903443314135, "learning_rate": 2.254e-06, "loss": 0.0, "num_tokens": 961976.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 60.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.027512365952134132, "kl": 0.0018479927675798535, "learning_rate": 2.2536666666666667e-06, "loss": 0.0001, "num_tokens": 962301.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 60.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.512545108795166, "kl": 0.0603331383317709, "learning_rate": 2.253333333333333e-06, "loss": 0.0792, "num_tokens": 962632.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 60.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.007467227056622505, "kl": 0.0010834246641024947, "learning_rate": 2.253e-06, "loss": 0.0001, "num_tokens": 962892.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 60.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.023858018219470978, "kl": 0.0012014210224151611, "learning_rate": 2.252666666666667e-06, "loss": 0.0001, "num_tokens": 963104.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3243 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.007352941203862429, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 60.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 2.4764111042022705, "kl": 0.035196663811802864, "learning_rate": 2.2523333333333334e-06, "loss": 0.0079, "num_tokens": 963515.0, "reward": 2.875, "reward_std": 0.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 0.25, "step": 3244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 7.319875717163086, "kl": 0.04025369882583618, "learning_rate": 2.252e-06, "loss": 0.3456, "num_tokens": 963825.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 60.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474195867776871, "kl": 0.005463895387947559, "learning_rate": 2.2516666666666666e-06, "loss": 0.0003, "num_tokens": 964101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 60.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.03988216072320938, "kl": 0.2616236209869385, "learning_rate": 2.2513333333333333e-06, "loss": 0.0131, "num_tokens": 964405.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 60.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.07153337448835373, "kl": 0.019581732340157032, "learning_rate": 2.251e-06, "loss": 0.001, "num_tokens": 964754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 60.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.002549492521211505, "kl": 0.003363586962223053, "learning_rate": 2.250666666666667e-06, "loss": 0.0002, "num_tokens": 964990.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 60.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.009882776066660881, "kl": 0.00037163496017456055, "learning_rate": 2.2503333333333333e-06, "loss": 0.0, "num_tokens": 965234.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.1212263107299805, "kl": 0.060780106112360954, "learning_rate": 2.25e-06, "loss": 0.0243, "num_tokens": 965525.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 3251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 60.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.05679375305771828, "kl": 0.017526951618492603, "learning_rate": 2.249666666666667e-06, "loss": 0.0007, "num_tokens": 965907.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 60.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04464339464902878, "kl": 0.002002660185098648, "learning_rate": 2.249333333333333e-06, "loss": 0.0001, "num_tokens": 966167.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 60.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.056290287524461746, "kl": 0.0019052649586228654, "learning_rate": 2.249e-06, "loss": 0.0001, "num_tokens": 966389.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 60.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.17134232819080353, "kl": 0.02606994565576315, "learning_rate": 2.2486666666666668e-06, "loss": 0.0013, "num_tokens": 966659.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 60.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 1.3570164442062378, "kl": 0.1328824907541275, "learning_rate": 2.2483333333333335e-06, "loss": 0.0059, "num_tokens": 966889.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 60.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.425131320953369, "kl": 0.02823589649051428, "learning_rate": 2.248e-06, "loss": 0.0956, "num_tokens": 967183.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 60.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.013110277242958546, "kl": 0.004298093728721142, "learning_rate": 2.2476666666666667e-06, "loss": 0.0002, "num_tokens": 967456.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 60.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.011147416196763515, "kl": 0.0010892586433328688, "learning_rate": 2.247333333333333e-06, "loss": 0.0001, "num_tokens": 967724.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 60.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.03502470627427101, "kl": 0.00041371583938598633, "learning_rate": 2.2470000000000003e-06, "loss": 0.0, "num_tokens": 967936.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 60.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.4776037931442261, "kl": 0.05135340057313442, "learning_rate": 2.246666666666667e-06, "loss": 0.0029, "num_tokens": 968214.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 60.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.04923976585268974, "kl": 0.15730884671211243, "learning_rate": 2.2463333333333334e-06, "loss": 0.0079, "num_tokens": 968523.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 60.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 3.1604881286621094, "kl": 0.06634473241865635, "learning_rate": 2.246e-06, "loss": 0.0439, "num_tokens": 968880.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09133684635162354, "kl": 0.011883596307598054, "learning_rate": 2.2456666666666665e-06, "loss": 0.0006, "num_tokens": 969151.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 60.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.002457198454067111, "kl": 0.01634630560874939, "learning_rate": 2.2453333333333333e-06, "loss": 0.0008, "num_tokens": 969411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3265 }, { "clip_ratio/high_max": 0.009433962404727936, "clip_ratio/high_mean": 0.009433962404727936, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009433962404727936, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 60.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.2356605529785156, "kl": 0.16095459461212158, "learning_rate": 2.245e-06, "loss": -0.0205, "num_tokens": 969768.0, "reward": 6.625, "reward_std": 2.428133726119995, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.428133726119995, "step": 3266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 60.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.3584771752357483, "kl": 0.061701999977231026, "learning_rate": 2.244666666666667e-06, "loss": 0.0039, "num_tokens": 970052.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07596871256828308, "kl": 0.012241472955793142, "learning_rate": 2.2443333333333332e-06, "loss": 0.0006, "num_tokens": 970336.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 60.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.10051976144313812, "kl": 0.03433547355234623, "learning_rate": 2.244e-06, "loss": 0.0017, "num_tokens": 970673.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 60.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003291579778306186, "kl": 6.13480806350708e-05, "learning_rate": 2.243666666666667e-06, "loss": 0.0, "num_tokens": 970893.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 60.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.18302927911281586, "kl": 0.030282115563750267, "learning_rate": 2.243333333333333e-06, "loss": 0.0014, "num_tokens": 971187.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 60.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.12607116997241974, "kl": 0.011203007772564888, "learning_rate": 2.243e-06, "loss": 0.0006, "num_tokens": 971515.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.041766345500946045, "kl": 0.005099525908008218, "learning_rate": 2.2426666666666667e-06, "loss": 0.0003, "num_tokens": 971799.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 60.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008860925445333123, "kl": 0.0012018127599731088, "learning_rate": 2.2423333333333335e-06, "loss": 0.0001, "num_tokens": 972079.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 60.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.034295372664928436, "kl": 0.05643720179796219, "learning_rate": 2.242e-06, "loss": 0.0028, "num_tokens": 972412.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 60.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.01388558465987444, "kl": 0.00814725086092949, "learning_rate": 2.2416666666666667e-06, "loss": 0.0004, "num_tokens": 972738.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.05313456058502197, "kl": 0.020086459815502167, "learning_rate": 2.2413333333333334e-06, "loss": 0.001, "num_tokens": 973009.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 60.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03039192594587803, "kl": 0.0003601104181143455, "learning_rate": 2.2410000000000002e-06, "loss": 0.0, "num_tokens": 973265.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 60.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.1424340009689331, "kl": 0.030056262388825417, "learning_rate": 2.240666666666667e-06, "loss": 0.0014, "num_tokens": 973567.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 60.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.021793438121676445, "kl": 0.0023846477270126343, "learning_rate": 2.2403333333333334e-06, "loss": 0.0001, "num_tokens": 973775.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 60.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.013799660839140415, "kl": 0.0026278942823410034, "learning_rate": 2.24e-06, "loss": 0.0001, "num_tokens": 974071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 60.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.00799830537289381, "kl": 0.00044744781916961074, "learning_rate": 2.2396666666666665e-06, "loss": 0.0, "num_tokens": 974307.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 60.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.34845107793807983, "kl": 0.03930492326617241, "learning_rate": 2.2393333333333333e-06, "loss": 0.0019, "num_tokens": 974648.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 60.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.10607807338237762, "kl": 0.008225849131122231, "learning_rate": 2.239e-06, "loss": 0.0004, "num_tokens": 974908.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 60.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732354000210762, "kl": 0.013455578591674566, "learning_rate": 2.238666666666667e-06, "loss": 0.0007, "num_tokens": 975238.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 60.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.08038192987442017, "kl": 0.010605439194478095, "learning_rate": 2.2383333333333332e-06, "loss": 0.0006, "num_tokens": 975570.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 60.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.09279978275299072, "kl": 0.08303552120923996, "learning_rate": 2.238e-06, "loss": 0.0042, "num_tokens": 975936.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 60.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 4.126615047454834, "kl": 0.06999421655200422, "learning_rate": 2.2376666666666668e-06, "loss": 0.0933, "num_tokens": 976235.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 60.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.022675970569252968, "kl": 0.0011477507650852203, "learning_rate": 2.237333333333333e-06, "loss": 0.0001, "num_tokens": 976547.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 60.925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 4.521695137023926, "kl": 0.052976700535509735, "learning_rate": 2.2370000000000004e-06, "loss": 0.0186, "num_tokens": 976865.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 60.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.36096513271331787, "kl": 0.030867554945871234, "learning_rate": 2.2366666666666667e-06, "loss": 0.0016, "num_tokens": 977174.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 60.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.9423773288726807, "kl": 0.026548580965027213, "learning_rate": 2.2363333333333335e-06, "loss": 0.0134, "num_tokens": 977480.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 60.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.055974461138248444, "kl": 0.0312656294554472, "learning_rate": 2.236e-06, "loss": 0.0016, "num_tokens": 977778.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 61.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.07440923154354095, "kl": 0.005075130713521503, "learning_rate": 2.2356666666666666e-06, "loss": 0.0002, "num_tokens": 978087.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 61.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.14330555498600006, "kl": 0.019268010277301073, "learning_rate": 2.2353333333333334e-06, "loss": 0.001, "num_tokens": 978419.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 61.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.1167730987071991, "kl": 0.012032601051032543, "learning_rate": 2.235e-06, "loss": 0.0006, "num_tokens": 978695.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 61.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.022528642788529396, "kl": 0.0007245764136314392, "learning_rate": 2.234666666666667e-06, "loss": 0.0, "num_tokens": 978955.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 61.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.015033102594316006, "kl": 0.0042897912207990885, "learning_rate": 2.2343333333333333e-06, "loss": 0.0002, "num_tokens": 979228.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 61.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.059593863785266876, "kl": 0.027693637646734715, "learning_rate": 2.234e-06, "loss": 0.0015, "num_tokens": 979517.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 61.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.019043680280447006, "kl": 0.04799039848148823, "learning_rate": 2.2336666666666665e-06, "loss": 0.0024, "num_tokens": 979849.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 61.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.016476990655064583, "kl": 0.00288031913805753, "learning_rate": 2.2333333333333333e-06, "loss": 0.0002, "num_tokens": 980151.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 61.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.04146501421928406, "kl": 0.2614165246486664, "learning_rate": 2.233e-06, "loss": 0.0131, "num_tokens": 980455.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 61.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.04730117693543434, "kl": 0.01096729189157486, "learning_rate": 2.232666666666667e-06, "loss": 0.0005, "num_tokens": 980754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 61.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.11177375912666321, "kl": 0.011237940285354853, "learning_rate": 2.232333333333333e-06, "loss": 0.0006, "num_tokens": 981015.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 61.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.004694729577749968, "kl": 0.0006412813963834196, "learning_rate": 2.232e-06, "loss": 0.0, "num_tokens": 981231.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.08639340102672577, "kl": 0.004216181579977274, "learning_rate": 2.2316666666666668e-06, "loss": 0.0002, "num_tokens": 981452.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 61.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 1.2088747024536133, "kl": 0.16476541478186846, "learning_rate": 2.2313333333333335e-06, "loss": 0.0078, "num_tokens": 981729.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 61.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.663725852966309, "kl": 0.025627458177041262, "learning_rate": 2.2310000000000003e-06, "loss": 0.0955, "num_tokens": 982010.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 61.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.01842394471168518, "kl": 0.00127878796774894, "learning_rate": 2.2306666666666667e-06, "loss": 0.0001, "num_tokens": 982333.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 61.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03290437534451485, "kl": 0.0007133185972634237, "learning_rate": 2.2303333333333335e-06, "loss": 0.0, "num_tokens": 982589.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 61.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.09576458483934402, "kl": 0.015226204879581928, "learning_rate": 2.23e-06, "loss": 0.0008, "num_tokens": 982877.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 61.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.2942647933959961, "kl": 0.05347330495715141, "learning_rate": 2.2296666666666666e-06, "loss": 0.0028, "num_tokens": 983205.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.005737815517932177, "kl": 0.0014646127820014954, "learning_rate": 2.2293333333333334e-06, "loss": 0.0001, "num_tokens": 983421.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 61.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.020061105489730835, "kl": 0.0006044578040018678, "learning_rate": 2.229e-06, "loss": 0.0, "num_tokens": 983656.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 61.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 3.012406587600708, "kl": 0.06878912821412086, "learning_rate": 2.228666666666667e-06, "loss": -0.0222, "num_tokens": 984025.0, "reward": 4.875, "reward_std": 3.75, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.75, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 61.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0343567430973053, "kl": 0.006915203528478742, "learning_rate": 2.2283333333333333e-06, "loss": 0.0003, "num_tokens": 984293.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 61.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.12580174207687378, "kl": 0.028369064209982753, "learning_rate": 2.228e-06, "loss": 0.0013, "num_tokens": 984615.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 61.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.018884699791669846, "kl": 0.000634184674709104, "learning_rate": 2.2276666666666665e-06, "loss": 0.0, "num_tokens": 984928.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 61.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006954753189347684, "kl": 0.0012185658561065793, "learning_rate": 2.2273333333333332e-06, "loss": 0.0001, "num_tokens": 985208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003398414410185069, "kl": 5.6333839893341064e-05, "learning_rate": 2.227e-06, "loss": 0.0, "num_tokens": 985428.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 84.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 84.0, "completions/mean_terminated_length": 26.666667938232422, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 61.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.566087484359741, "kl": 0.04127374291419983, "learning_rate": 2.226666666666667e-06, "loss": 0.5316, "num_tokens": 986016.0, "reward": 7.175000190734863, "reward_std": 0.39475739002227783, "rewards/reward_combined/mean": 7.175000190734863, "rewards/reward_combined/std": 0.39475739002227783, "step": 3321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 61.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.018250636756420135, "kl": 0.002339041791856289, "learning_rate": 2.2263333333333336e-06, "loss": 0.0001, "num_tokens": 986328.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.031394533812999725, "kl": 0.00028486549854278564, "learning_rate": 2.226e-06, "loss": 0.0, "num_tokens": 986540.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 61.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03656277433037758, "kl": 0.0016409651725552976, "learning_rate": 2.2256666666666667e-06, "loss": 0.0001, "num_tokens": 986807.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 61.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 4.334615707397461, "kl": 0.028304174542427063, "learning_rate": 2.2253333333333335e-06, "loss": -0.0167, "num_tokens": 987105.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022786613553762436, "kl": 0.0033953189849853516, "learning_rate": 2.2250000000000003e-06, "loss": 0.0002, "num_tokens": 987341.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 61.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.02110440470278263, "kl": 0.0031103537185117602, "learning_rate": 2.2246666666666667e-06, "loss": 0.0002, "num_tokens": 987623.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3327 }, { "clip_ratio/high_max": 0.012195121496915817, "clip_ratio/high_mean": 0.012195121496915817, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012195121496915817, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 61.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 4.281612873077393, "kl": 0.05813291110098362, "learning_rate": 2.2243333333333334e-06, "loss": 0.0149, "num_tokens": 987924.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 61.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 2.2800800800323486, "kl": 0.02919972687959671, "learning_rate": 2.224e-06, "loss": 0.0681, "num_tokens": 988299.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 61.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041159335523843765, "kl": 0.016093909740447998, "learning_rate": 2.2236666666666666e-06, "loss": 0.0008, "num_tokens": 988559.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 61.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.040952134877443314, "kl": 0.00654768873937428, "learning_rate": 2.2233333333333334e-06, "loss": 0.0003, "num_tokens": 988880.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 61.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.4205713272094727, "kl": 0.027768907137215137, "learning_rate": 2.223e-06, "loss": 0.225, "num_tokens": 989248.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 3332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 61.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.05844840779900551, "kl": 0.15637709200382233, "learning_rate": 2.222666666666667e-06, "loss": 0.0078, "num_tokens": 989557.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 61.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.12842020392417908, "kl": 0.009622111800126731, "learning_rate": 2.2223333333333333e-06, "loss": 0.0005, "num_tokens": 989847.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 61.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01615673303604126, "kl": 0.0009997934103012085, "learning_rate": 2.222e-06, "loss": 0.0, "num_tokens": 990107.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 61.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.7181059122085571, "kl": 0.04054766148328781, "learning_rate": 2.2216666666666664e-06, "loss": 0.0018, "num_tokens": 990371.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 61.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06952530890703201, "kl": 0.006099112331867218, "learning_rate": 2.2213333333333336e-06, "loss": 0.0003, "num_tokens": 990615.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 61.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 1.1310569047927856, "kl": 0.04693709872663021, "learning_rate": 2.221e-06, "loss": -0.0128, "num_tokens": 991028.0, "reward": 2.875, "reward_std": 0.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 0.25, "step": 3338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 61.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.03884014114737511, "kl": 0.10007043182849884, "learning_rate": 2.2206666666666668e-06, "loss": 0.0051, "num_tokens": 991396.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 61.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 3.2800207138061523, "kl": 0.07188292033970356, "learning_rate": 2.2203333333333336e-06, "loss": -0.0054, "num_tokens": 991732.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 61.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.05696495622396469, "kl": 0.011853685136884451, "learning_rate": 2.22e-06, "loss": 0.0006, "num_tokens": 992022.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 61.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.061937589198350906, "kl": 0.005190921947360039, "learning_rate": 2.2196666666666667e-06, "loss": 0.0003, "num_tokens": 992320.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 61.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.13253596425056458, "kl": 0.016126180300489068, "learning_rate": 2.2193333333333335e-06, "loss": 0.0009, "num_tokens": 992664.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 61.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.051882702857255936, "kl": 0.0064075172413140535, "learning_rate": 2.2190000000000003e-06, "loss": 0.0003, "num_tokens": 992984.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 61.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.051254939287900925, "kl": 0.012020382098853588, "learning_rate": 2.2186666666666666e-06, "loss": 0.0006, "num_tokens": 993397.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 61.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07140407711267471, "kl": 0.0061962848994880915, "learning_rate": 2.2183333333333334e-06, "loss": 0.0003, "num_tokens": 993659.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 61.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 7.329586982727051, "kl": 0.03178100101649761, "learning_rate": 2.2179999999999998e-06, "loss": 0.3282, "num_tokens": 993949.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 3347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 62.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01830742135643959, "kl": 0.0025819912552833557, "learning_rate": 2.2176666666666666e-06, "loss": 0.0002, "num_tokens": 994157.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 62.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06955549120903015, "kl": 0.008563360664993525, "learning_rate": 2.2173333333333333e-06, "loss": 0.0004, "num_tokens": 994431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 62.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09347642958164215, "kl": 0.014489857479929924, "learning_rate": 2.217e-06, "loss": 0.0007, "num_tokens": 994767.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 62.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05998816713690758, "kl": 0.0013189032906666398, "learning_rate": 2.216666666666667e-06, "loss": 0.0001, "num_tokens": 994980.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 62.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.04640239477157593, "kl": 0.0019155815825797617, "learning_rate": 2.2163333333333333e-06, "loss": 0.0001, "num_tokens": 995248.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 62.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.06852405518293381, "kl": 0.004565507173538208, "learning_rate": 2.216e-06, "loss": 0.0002, "num_tokens": 995464.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 62.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 4.153611660003662, "kl": 0.08314665406942368, "learning_rate": 2.215666666666667e-06, "loss": 0.045, "num_tokens": 995760.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 62.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.006426460109651089, "kl": 0.0003775298537220806, "learning_rate": 2.2153333333333336e-06, "loss": 0.0, "num_tokens": 995980.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 62.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.12019623816013336, "kl": 0.041127199307084084, "learning_rate": 2.215e-06, "loss": 0.0021, "num_tokens": 996309.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.037281282246112823, "kl": 0.004518923815339804, "learning_rate": 2.2146666666666668e-06, "loss": 0.0002, "num_tokens": 996593.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 62.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.07549259066581726, "kl": 0.028739141300320625, "learning_rate": 2.2143333333333335e-06, "loss": 0.0014, "num_tokens": 996930.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 62.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.027317391708493233, "kl": 0.003621056559495628, "learning_rate": 2.214e-06, "loss": 0.0002, "num_tokens": 997190.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.06416074931621552, "kl": 0.004746583057567477, "learning_rate": 2.2136666666666667e-06, "loss": 0.0002, "num_tokens": 997490.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 62.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06248186156153679, "kl": 0.012136355508118868, "learning_rate": 2.2133333333333335e-06, "loss": 0.0006, "num_tokens": 997793.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 62.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.9482815265655518, "kl": 0.05561781022697687, "learning_rate": 2.2130000000000002e-06, "loss": -0.0597, "num_tokens": 998120.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 62.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 5.304361343383789, "kl": 0.010267814621329308, "learning_rate": 2.2126666666666666e-06, "loss": 0.2741, "num_tokens": 998399.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 62.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08255226910114288, "kl": 0.004989365988876671, "learning_rate": 2.2123333333333334e-06, "loss": 0.0003, "num_tokens": 998721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 62.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.05197465792298317, "kl": 0.012701776344329119, "learning_rate": 2.2119999999999997e-06, "loss": 0.0005, "num_tokens": 999103.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.1006372720003128, "kl": 0.018497135490179062, "learning_rate": 2.2116666666666665e-06, "loss": 0.001, "num_tokens": 999390.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 62.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.11387915909290314, "kl": 0.013417403679341078, "learning_rate": 2.2113333333333337e-06, "loss": 0.0007, "num_tokens": 999680.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 62.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.024978553876280785, "kl": 0.003375243606569711, "learning_rate": 2.211e-06, "loss": 0.0002, "num_tokens": 999944.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 62.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.06752584129571915, "kl": 0.004735873662866652, "learning_rate": 2.210666666666667e-06, "loss": 0.0002, "num_tokens": 1000242.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 62.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.028170356526970863, "kl": 0.005824573803693056, "learning_rate": 2.2103333333333332e-06, "loss": 0.0003, "num_tokens": 1000510.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 9.58154582977295, "kl": 0.09674317017197609, "learning_rate": 2.21e-06, "loss": 0.1458, "num_tokens": 1000804.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 62.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 1.5221912860870361, "kl": 0.29760829266160727, "learning_rate": 2.209666666666667e-06, "loss": 0.0141, "num_tokens": 1001063.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 62.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.13651998341083527, "kl": 0.021502234041690826, "learning_rate": 2.2093333333333336e-06, "loss": 0.0011, "num_tokens": 1001345.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 62.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05915094539523125, "kl": 0.005745707137975842, "learning_rate": 2.209e-06, "loss": 0.0003, "num_tokens": 1001672.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 62.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.4944217205047607, "kl": 0.04194041155278683, "learning_rate": 2.2086666666666667e-06, "loss": 0.128, "num_tokens": 1002037.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 62.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0517662949860096, "kl": 0.14405743777751923, "learning_rate": 2.2083333333333335e-06, "loss": 0.0072, "num_tokens": 1002350.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 62.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.030326293781399727, "kl": 0.03141090925782919, "learning_rate": 2.208e-06, "loss": 0.0016, "num_tokens": 1002755.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 62.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.027798175811767578, "kl": 0.004956083721481264, "learning_rate": 2.2076666666666666e-06, "loss": 0.0003, "num_tokens": 1003078.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 62.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 3.0506179332733154, "kl": 0.30602581799030304, "learning_rate": 2.2073333333333334e-06, "loss": 0.0154, "num_tokens": 1003382.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 62.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 3.3334875106811523, "kl": 0.28185568004846573, "learning_rate": 2.2070000000000002e-06, "loss": 0.0218, "num_tokens": 1003715.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 62.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.13829700648784637, "kl": 0.11752831935882568, "learning_rate": 2.2066666666666666e-06, "loss": 0.0059, "num_tokens": 1004083.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 62.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.012094398960471153, "kl": 0.0006010000070091337, "learning_rate": 2.2063333333333334e-06, "loss": 0.0, "num_tokens": 1004318.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 62.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 1.11672842502594, "kl": 0.046068258583545685, "learning_rate": 2.2059999999999997e-06, "loss": 0.0026, "num_tokens": 1004528.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 62.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.7228609919548035, "kl": 0.1081484891474247, "learning_rate": 2.205666666666667e-06, "loss": 0.0055, "num_tokens": 1004910.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 62.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.007040159311145544, "kl": 0.00012621581845451146, "learning_rate": 2.2053333333333337e-06, "loss": 0.0, "num_tokens": 1005166.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1098102331161499, "kl": 0.01248421985656023, "learning_rate": 2.205e-06, "loss": 0.0006, "num_tokens": 1005438.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 62.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.003340615890920162, "kl": 0.0016763443127274513, "learning_rate": 2.204666666666667e-06, "loss": 0.0001, "num_tokens": 1005750.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 62.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00031305052107200027, "kl": 6.771087646484375e-05, "learning_rate": 2.204333333333333e-06, "loss": 0.0, "num_tokens": 1005970.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 62.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.028128478676080704, "kl": 0.0009418537665624171, "learning_rate": 2.204e-06, "loss": 0.0, "num_tokens": 1006234.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 62.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.6902763843536377, "kl": 0.1394881308078766, "learning_rate": 2.2036666666666668e-06, "loss": 0.0069, "num_tokens": 1006607.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 62.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 2.3452072143554688, "kl": 0.3022227343171835, "learning_rate": 2.2033333333333336e-06, "loss": 0.0169, "num_tokens": 1006934.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.032515645027160645, "kl": 0.0014913790510036051, "learning_rate": 2.203e-06, "loss": 0.0001, "num_tokens": 1007201.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3392 }, { "clip_ratio/high_max": 0.008064515888690948, "clip_ratio/high_mean": 0.008064515888690948, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008064515888690948, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 62.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 4.496065616607666, "kl": 0.1216975748538971, "learning_rate": 2.2026666666666667e-06, "loss": -0.1729, "num_tokens": 1007525.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 62.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.32536420226097107, "kl": 0.04868849087506533, "learning_rate": 2.2023333333333335e-06, "loss": 0.0031, "num_tokens": 1007809.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 62.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.05128289386630058, "kl": 0.010341319721192122, "learning_rate": 2.202e-06, "loss": 0.0005, "num_tokens": 1008095.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 62.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 4.608768463134766, "kl": 0.05090206302702427, "learning_rate": 2.2016666666666666e-06, "loss": 0.0705, "num_tokens": 1008437.0, "reward": 2.375, "reward_std": 2.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 2.25, "step": 3396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 62.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.04566143453121185, "kl": 0.002851595403626561, "learning_rate": 2.2013333333333334e-06, "loss": 0.0001, "num_tokens": 1008680.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 62.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007186815491877496, "kl": 0.0012371200136840343, "learning_rate": 2.201e-06, "loss": 0.0001, "num_tokens": 1008960.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 62.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07954184710979462, "kl": 0.001954585313796997, "learning_rate": 2.2006666666666665e-06, "loss": 0.0001, "num_tokens": 1009176.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 62.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022725940216332674, "kl": 0.00338079035282135, "learning_rate": 2.2003333333333333e-06, "loss": 0.0002, "num_tokens": 1009412.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012195121496915817, "clip_ratio/low_min": 0.012195121496915817, "clip_ratio/region_mean": 0.012195121496915817, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 62.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.9990084171295166, "kl": 0.03532938752323389, "learning_rate": 2.1999999999999997e-06, "loss": 0.0991, "num_tokens": 1009726.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 3401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 63.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.7755279541015625, "kl": 0.05305712204426527, "learning_rate": 2.199666666666667e-06, "loss": -0.0979, "num_tokens": 1010011.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 63.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1649562120437622, "kl": 0.01902078534476459, "learning_rate": 2.1993333333333337e-06, "loss": 0.001, "num_tokens": 1010349.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 63.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024097179993987083, "kl": 0.003337077796459198, "learning_rate": 2.199e-06, "loss": 0.0002, "num_tokens": 1010585.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 63.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02678484283387661, "kl": 0.0006867200136184692, "learning_rate": 2.198666666666667e-06, "loss": 0.0, "num_tokens": 1010853.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 63.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 3.996821165084839, "kl": 0.2182103544473648, "learning_rate": 2.198333333333333e-06, "loss": 0.0521, "num_tokens": 1011154.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 63.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014491081237792969, "kl": 0.00044173747301101685, "learning_rate": 2.198e-06, "loss": 0.0, "num_tokens": 1011414.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 63.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 2.6314876079559326, "kl": 0.040607634000480175, "learning_rate": 2.1976666666666667e-06, "loss": -0.03, "num_tokens": 1011765.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 63.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.04953719675540924, "kl": 0.005309153348207474, "learning_rate": 2.1973333333333335e-06, "loss": 0.0003, "num_tokens": 1012094.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 63.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003084309573750943, "kl": 7.05718994140625e-05, "learning_rate": 2.197e-06, "loss": 0.0, "num_tokens": 1012314.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 63.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.11731244623661041, "kl": 0.012788111809641123, "learning_rate": 2.1966666666666667e-06, "loss": 0.0006, "num_tokens": 1012609.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 63.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 4.53308629989624, "kl": 0.05539463832974434, "learning_rate": 2.1963333333333335e-06, "loss": -0.0533, "num_tokens": 1012938.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 63.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.813312530517578, "kl": 0.09071268513798714, "learning_rate": 2.196e-06, "loss": 0.0086, "num_tokens": 1013255.0, "reward": 2.25, "reward_std": 2.020725965499878, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 2.020725965499878, "step": 3413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 63.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.053565025329589844, "kl": 0.023334696888923645, "learning_rate": 2.195666666666667e-06, "loss": 0.0012, "num_tokens": 1013600.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 63.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.8939841985702515, "kl": 0.08374106511473656, "learning_rate": 2.1953333333333334e-06, "loss": 0.0413, "num_tokens": 1013937.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 63.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.026571355760097504, "kl": 0.0007585063576698303, "learning_rate": 2.195e-06, "loss": 0.0, "num_tokens": 1014181.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 63.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.005973426625132561, "kl": 0.0002433204062981531, "learning_rate": 2.1946666666666665e-06, "loss": 0.0, "num_tokens": 1014497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 63.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.09025933593511581, "kl": 0.005301809869706631, "learning_rate": 2.1943333333333333e-06, "loss": 0.0003, "num_tokens": 1014719.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 63.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.36134329438209534, "kl": 0.029291070997714996, "learning_rate": 2.194e-06, "loss": 0.002, "num_tokens": 1014927.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 63.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.17901213467121124, "kl": 0.03326728194952011, "learning_rate": 2.193666666666667e-06, "loss": 0.0017, "num_tokens": 1015258.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 63.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 5.0757012367248535, "kl": 0.26542486995458603, "learning_rate": 2.1933333333333337e-06, "loss": 0.342, "num_tokens": 1015588.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 3421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 63.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 7.303065299987793, "kl": 0.681626558303833, "learning_rate": 2.193e-06, "loss": -0.1514, "num_tokens": 1015843.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 3422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 63.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.006917375605553389, "kl": 0.2675882875919342, "learning_rate": 2.192666666666667e-06, "loss": 0.0134, "num_tokens": 1016147.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 63.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.15333710610866547, "kl": 0.004445262253284454, "learning_rate": 2.192333333333333e-06, "loss": 0.0002, "num_tokens": 1016359.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 63.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.04257291927933693, "kl": 0.01012834021821618, "learning_rate": 2.192e-06, "loss": 0.0005, "num_tokens": 1016664.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 63.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.031284891068935394, "kl": 0.12643224373459816, "learning_rate": 2.1916666666666667e-06, "loss": 0.0064, "num_tokens": 1017034.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 63.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008705061045475304, "kl": 0.001197568024508655, "learning_rate": 2.1913333333333335e-06, "loss": 0.0001, "num_tokens": 1017314.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 63.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.005729298572987318, "kl": 0.0014186277985572815, "learning_rate": 2.191e-06, "loss": 0.0001, "num_tokens": 1017530.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 63.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.176250457763672, "kl": 0.028239657171070576, "learning_rate": 2.1906666666666666e-06, "loss": 0.1668, "num_tokens": 1017815.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 63.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.11604803800582886, "kl": 0.013675297028385103, "learning_rate": 2.1903333333333334e-06, "loss": 0.0007, "num_tokens": 1018141.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 63.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.013029688969254494, "kl": 0.0003342479467391968, "learning_rate": 2.19e-06, "loss": 0.0, "num_tokens": 1018353.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 63.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05150968208909035, "kl": 0.0017892661562655121, "learning_rate": 2.189666666666667e-06, "loss": 0.0001, "num_tokens": 1018586.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 63.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 1.2737945318222046, "kl": 0.07921257987618446, "learning_rate": 2.1893333333333334e-06, "loss": 0.0277, "num_tokens": 1018993.0, "reward": 2.875, "reward_std": 0.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 0.25, "step": 3433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 63.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.008199530653655529, "kl": 0.001806585118174553, "learning_rate": 2.189e-06, "loss": 0.0001, "num_tokens": 1019305.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 63.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.02698396146297455, "kl": 0.0057526868768036366, "learning_rate": 2.1886666666666665e-06, "loss": 0.0003, "num_tokens": 1019583.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 63.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.04575595632195473, "kl": 0.013479504734277725, "learning_rate": 2.1883333333333333e-06, "loss": 0.0006, "num_tokens": 1019965.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 63.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.18682636320590973, "kl": 0.01179274870082736, "learning_rate": 2.188e-06, "loss": 0.0006, "num_tokens": 1020263.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 63.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.009331322275102139, "kl": 0.014926896896213293, "learning_rate": 2.187666666666667e-06, "loss": 0.0007, "num_tokens": 1020523.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 63.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.02341627888381481, "kl": 0.006356117781251669, "learning_rate": 2.1873333333333336e-06, "loss": 0.0003, "num_tokens": 1020811.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 63.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.053721267729997635, "kl": 0.004764894372783601, "learning_rate": 2.187e-06, "loss": 0.0003, "num_tokens": 1021082.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 63.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.549738645553589, "kl": 0.14344819635152817, "learning_rate": 2.1866666666666668e-06, "loss": 0.0602, "num_tokens": 1021432.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 63.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01474683452397585, "kl": 0.0010590986930765212, "learning_rate": 2.186333333333333e-06, "loss": 0.0001, "num_tokens": 1021700.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 63.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.11663239449262619, "kl": 0.041027034632861614, "learning_rate": 2.186e-06, "loss": 0.0021, "num_tokens": 1021990.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 63.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.997528076171875, "kl": 0.04900665208697319, "learning_rate": 2.1856666666666667e-06, "loss": -0.066, "num_tokens": 1022328.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 63.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.033921096473932266, "kl": 0.0043184710666537285, "learning_rate": 2.1853333333333335e-06, "loss": 0.0002, "num_tokens": 1022612.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 63.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.03759907931089401, "kl": 0.002453757624607533, "learning_rate": 2.1850000000000003e-06, "loss": 0.0001, "num_tokens": 1022914.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 63.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0442030169069767, "kl": 0.15268483757972717, "learning_rate": 2.1846666666666666e-06, "loss": 0.0076, "num_tokens": 1023223.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 63.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.059195198118686676, "kl": 0.009379944764077663, "learning_rate": 2.1843333333333334e-06, "loss": 0.0005, "num_tokens": 1023497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 63.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.03964879363775253, "kl": 0.007741866167634726, "learning_rate": 2.184e-06, "loss": 0.0004, "num_tokens": 1023792.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 63.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 2.095860481262207, "kl": 0.023096129298210144, "learning_rate": 2.183666666666667e-06, "loss": 0.0493, "num_tokens": 1024148.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 63.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.3290202617645264, "kl": 0.04408697225153446, "learning_rate": 2.1833333333333333e-06, "loss": 0.0614, "num_tokens": 1024439.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 63.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.263012558221817, "kl": 0.061270684003829956, "learning_rate": 2.183e-06, "loss": 0.0031, "num_tokens": 1024735.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 63.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.034099407494068146, "kl": 0.0060203049797564745, "learning_rate": 2.1826666666666665e-06, "loss": 0.0003, "num_tokens": 1025003.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 63.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.044186048209667206, "kl": 0.0025317840045318007, "learning_rate": 2.1823333333333332e-06, "loss": 0.0001, "num_tokens": 1025274.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 63.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09788111597299576, "kl": 0.003438621759414673, "learning_rate": 2.182e-06, "loss": 0.0002, "num_tokens": 1025530.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 64.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.007005822379142046, "kl": 0.0006105720822233707, "learning_rate": 2.181666666666667e-06, "loss": 0.0, "num_tokens": 1025790.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 64.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02793942205607891, "kl": 0.12566199526190758, "learning_rate": 2.1813333333333336e-06, "loss": 0.0063, "num_tokens": 1026160.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 64.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 9.262382507324219, "kl": 1.774953931570053, "learning_rate": 2.181e-06, "loss": 0.1041, "num_tokens": 1026465.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 64.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.15645377337932587, "kl": 0.02274109423160553, "learning_rate": 2.1806666666666667e-06, "loss": 0.0012, "num_tokens": 1026754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 64.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03744257614016533, "kl": 0.0033129056682810187, "learning_rate": 2.180333333333333e-06, "loss": 0.0001, "num_tokens": 1027016.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 64.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04662017151713371, "kl": 0.0016747861409385223, "learning_rate": 2.1800000000000003e-06, "loss": 0.0001, "num_tokens": 1027272.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 64.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0083814337849617, "kl": 0.0016384795308113098, "learning_rate": 2.1796666666666667e-06, "loss": 0.0001, "num_tokens": 1027584.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00520468782633543, "kl": 0.000131264328956604, "learning_rate": 2.1793333333333334e-06, "loss": 0.0, "num_tokens": 1027796.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 64.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.279834508895874, "kl": 0.03943028347566724, "learning_rate": 2.1790000000000002e-06, "loss": 0.0024, "num_tokens": 1028078.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 64.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.3873960971832275, "kl": 0.054739379324018955, "learning_rate": 2.1786666666666666e-06, "loss": -0.0447, "num_tokens": 1028405.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.02080976776778698, "kl": 0.0014346256357384846, "learning_rate": 2.1783333333333334e-06, "loss": 0.0001, "num_tokens": 1028624.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 64.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 3.331995725631714, "kl": 0.045357080176472664, "learning_rate": 2.178e-06, "loss": -0.0537, "num_tokens": 1028925.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 64.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.013082461431622505, "kl": 0.0006137114251032472, "learning_rate": 2.177666666666667e-06, "loss": 0.0, "num_tokens": 1029159.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 64.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.07272084057331085, "kl": 0.004364542197436094, "learning_rate": 2.1773333333333333e-06, "loss": 0.0002, "num_tokens": 1029486.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 64.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0633191391825676, "kl": 0.003674787236377597, "learning_rate": 2.177e-06, "loss": 0.0002, "num_tokens": 1029750.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 64.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.010293482802808285, "kl": 0.014683597721159458, "learning_rate": 2.1766666666666664e-06, "loss": 0.0007, "num_tokens": 1030010.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 64.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.1277714967727661, "kl": 0.0244381595402956, "learning_rate": 2.1763333333333332e-06, "loss": 0.0012, "num_tokens": 1030333.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3472 }, { "clip_ratio/high_max": 0.0021276595070958138, "clip_ratio/high_mean": 0.0021276595070958138, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021276595070958138, "completion_length": 78.5, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 78.5, "completions/mean_terminated_length": 78.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 64.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 5.871088981628418, "kl": 0.05076884664595127, "learning_rate": 2.176e-06, "loss": 0.0865, "num_tokens": 1030863.0, "reward": 2.75, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.190238118171692, "step": 3473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 64.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.4247271716594696, "kl": 0.04272638913244009, "learning_rate": 2.1756666666666668e-06, "loss": 0.0021, "num_tokens": 1031175.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 64.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008122501894831657, "kl": 0.0012579411268234253, "learning_rate": 2.1753333333333336e-06, "loss": 0.0001, "num_tokens": 1031455.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 64.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07620169222354889, "kl": 0.016548125073313713, "learning_rate": 2.175e-06, "loss": 0.0008, "num_tokens": 1031750.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 64.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07757337391376495, "kl": 0.025181420147418976, "learning_rate": 2.1746666666666667e-06, "loss": 0.0013, "num_tokens": 1032109.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 64.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.2446059137582779, "kl": 0.03443950600922108, "learning_rate": 2.1743333333333335e-06, "loss": 0.0017, "num_tokens": 1032411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 64.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01193982269614935, "kl": 0.000527895987033844, "learning_rate": 2.1740000000000003e-06, "loss": 0.0, "num_tokens": 1032725.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 64.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.08313068747520447, "kl": 0.02006299328058958, "learning_rate": 2.1736666666666666e-06, "loss": 0.001, "num_tokens": 1033071.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 64.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.3883612155914307, "kl": 0.17425037547945976, "learning_rate": 2.1733333333333334e-06, "loss": 0.0479, "num_tokens": 1033408.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 3481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 64.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09303069114685059, "kl": 0.006640292820520699, "learning_rate": 2.173e-06, "loss": 0.0004, "num_tokens": 1033672.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.01857859455049038, "kl": 0.001703709363937378, "learning_rate": 2.1726666666666666e-06, "loss": 0.0001, "num_tokens": 1033888.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 64.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09799665957689285, "kl": 0.007824670989066362, "learning_rate": 2.1723333333333333e-06, "loss": 0.0004, "num_tokens": 1034181.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 64.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.005649257451295853, "kl": 0.0004132688045501709, "learning_rate": 2.172e-06, "loss": 0.0, "num_tokens": 1034441.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 64.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.091744899749756, "kl": 0.04026716947555542, "learning_rate": 2.171666666666667e-06, "loss": 0.1012, "num_tokens": 1034727.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 64.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.019582778215408325, "kl": 0.0008381694206036627, "learning_rate": 2.1713333333333333e-06, "loss": 0.0, "num_tokens": 1034991.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 64.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.4010632038116455, "kl": 0.0780010549351573, "learning_rate": 2.171e-06, "loss": 0.0308, "num_tokens": 1035295.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 64.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04042023792862892, "kl": 0.002430828579235822, "learning_rate": 2.1706666666666664e-06, "loss": 0.0001, "num_tokens": 1035593.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 64.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.300741195678711, "kl": 0.21128271240741014, "learning_rate": 2.170333333333333e-06, "loss": -0.0454, "num_tokens": 1035946.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 64.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.026651019230484962, "kl": 0.006115816533565521, "learning_rate": 2.1700000000000004e-06, "loss": 0.0003, "num_tokens": 1036234.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 64.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.049924980849027634, "kl": 0.0010692626237869263, "learning_rate": 2.1696666666666668e-06, "loss": 0.0001, "num_tokens": 1036442.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 64.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.594965696334839, "kl": 0.05903918435797095, "learning_rate": 2.1693333333333335e-06, "loss": -0.1254, "num_tokens": 1036782.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 3493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 64.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.014029777608811855, "kl": 0.0003986656665802002, "learning_rate": 2.169e-06, "loss": 0.0, "num_tokens": 1036994.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 64.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07352840155363083, "kl": 0.00826783082447946, "learning_rate": 2.1686666666666667e-06, "loss": 0.0004, "num_tokens": 1037262.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 64.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.05641140416264534, "kl": 0.009500264655798674, "learning_rate": 2.1683333333333335e-06, "loss": 0.0005, "num_tokens": 1037532.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 64.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771590918302536, "kl": 0.006277984473854303, "learning_rate": 2.1680000000000002e-06, "loss": 0.0003, "num_tokens": 1037836.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 64.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04200340434908867, "kl": 0.042444733902812004, "learning_rate": 2.1676666666666666e-06, "loss": 0.0021, "num_tokens": 1038243.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002923685824498534, "kl": 7.661432027816772e-05, "learning_rate": 2.1673333333333334e-06, "loss": 0.0, "num_tokens": 1038463.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 64.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05609564855694771, "kl": 0.01435369485989213, "learning_rate": 2.167e-06, "loss": 0.0008, "num_tokens": 1038749.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 64.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05979641154408455, "kl": 0.14969105273485184, "learning_rate": 2.1666666666666665e-06, "loss": 0.0075, "num_tokens": 1039060.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 64.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.044393863528966904, "kl": 0.005250045796856284, "learning_rate": 2.1663333333333333e-06, "loss": 0.0003, "num_tokens": 1039344.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031597509514540434, "kl": 0.003224901854991913, "learning_rate": 2.166e-06, "loss": 0.0002, "num_tokens": 1039580.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 64.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.353760004043579, "kl": 0.0762135200202465, "learning_rate": 2.165666666666667e-06, "loss": -0.1731, "num_tokens": 1039944.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 92.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 92.5, "completions/mean_terminated_length": 38.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 64.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.4393672943115234, "kl": 0.1531917154788971, "learning_rate": 2.1653333333333332e-06, "loss": 0.004, "num_tokens": 1040542.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 64.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.13756895065307617, "kl": 0.01514653256163001, "learning_rate": 2.165e-06, "loss": 0.0009, "num_tokens": 1040822.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 64.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 1.1609680652618408, "kl": 0.0838811844587326, "learning_rate": 2.1646666666666664e-06, "loss": 0.0042, "num_tokens": 1041066.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.657221257686615, "kl": 0.0866007343865931, "learning_rate": 2.1643333333333336e-06, "loss": 0.0026, "num_tokens": 1041320.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 64.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.043962541967630386, "kl": 0.0067591299302875996, "learning_rate": 2.1640000000000004e-06, "loss": 0.0004, "num_tokens": 1041590.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.0, "frac_reward_zero_std": 0.0, "grad_norm": 7.141776084899902, "kl": 0.03479503915878013, "learning_rate": 2.1636666666666667e-06, "loss": 0.044, "num_tokens": 1041914.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 65.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.9385812282562256, "kl": 0.050240641459822655, "learning_rate": 2.1633333333333335e-06, "loss": 0.0373, "num_tokens": 1042284.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 65.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.14359606802463531, "kl": 0.018712486838921905, "learning_rate": 2.163e-06, "loss": 0.0009, "num_tokens": 1042590.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 65.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 7.02384090423584, "kl": 0.012163963634520769, "learning_rate": 2.1626666666666667e-06, "loss": 0.1879, "num_tokens": 1042866.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 65.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.820356607437134, "kl": 0.11114047560840845, "learning_rate": 2.1623333333333334e-06, "loss": 0.023, "num_tokens": 1043153.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 65.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.009961447678506374, "kl": 0.014737188816070557, "learning_rate": 2.1620000000000002e-06, "loss": 0.0007, "num_tokens": 1043413.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.403839588165283, "kl": 0.009551340248435736, "learning_rate": 2.1616666666666666e-06, "loss": -0.0425, "num_tokens": 1043695.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 65.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.660060405731201, "kl": 0.025711173191666603, "learning_rate": 2.1613333333333334e-06, "loss": 0.1979, "num_tokens": 1044054.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04941617697477341, "kl": 0.006203743629157543, "learning_rate": 2.161e-06, "loss": 0.0003, "num_tokens": 1044322.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 65.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.6261063814163208, "kl": 0.39721184223890305, "learning_rate": 2.1606666666666665e-06, "loss": -0.0546, "num_tokens": 1044694.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 3519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 65.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.012366310693323612, "kl": 0.002239805646240711, "learning_rate": 2.1603333333333333e-06, "loss": 0.0001, "num_tokens": 1045006.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 65.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.33784300088882446, "kl": 0.026781465858221054, "learning_rate": 2.16e-06, "loss": 0.002, "num_tokens": 1045266.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.021401792764663696, "kl": 0.0011554970405995846, "learning_rate": 2.159666666666667e-06, "loss": 0.0001, "num_tokens": 1045562.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 65.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.01038454007357359, "kl": 0.0004698584525613114, "learning_rate": 2.1593333333333332e-06, "loss": 0.0, "num_tokens": 1045797.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 65.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 3.6637165546417236, "kl": 0.16146893240511417, "learning_rate": 2.159e-06, "loss": 0.0344, "num_tokens": 1046135.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 65.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.019443562254309654, "kl": 0.002966132014989853, "learning_rate": 2.1586666666666664e-06, "loss": 0.0001, "num_tokens": 1046403.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 65.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 4.420718669891357, "kl": 0.033904400654137135, "learning_rate": 2.1583333333333336e-06, "loss": -0.2862, "num_tokens": 1046780.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 65.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.8334104418754578, "kl": 0.1618670579046011, "learning_rate": 2.1580000000000003e-06, "loss": 0.0086, "num_tokens": 1047117.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 65.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 1.2802926301956177, "kl": 0.2563718780875206, "learning_rate": 2.1576666666666667e-06, "loss": 0.0105, "num_tokens": 1047415.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 65.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.058315105736255646, "kl": 0.2578686773777008, "learning_rate": 2.1573333333333335e-06, "loss": 0.0129, "num_tokens": 1047719.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 65.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0315011627972126, "kl": 0.04525031894445419, "learning_rate": 2.157e-06, "loss": 0.0023, "num_tokens": 1048123.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 65.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.009876912459731102, "kl": 0.0001265406608581543, "learning_rate": 2.1566666666666666e-06, "loss": 0.0, "num_tokens": 1048335.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 65.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.061519473791122437, "kl": 0.006890603573992848, "learning_rate": 2.1563333333333334e-06, "loss": 0.0004, "num_tokens": 1048665.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 65.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 5.128335475921631, "kl": 0.04269075766205788, "learning_rate": 2.156e-06, "loss": 0.0712, "num_tokens": 1048961.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 65.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.15544039011001587, "kl": 0.005920931696891785, "learning_rate": 2.1556666666666666e-06, "loss": 0.0004, "num_tokens": 1049167.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06937694549560547, "kl": 0.0030993041582405567, "learning_rate": 2.1553333333333333e-06, "loss": 0.0002, "num_tokens": 1049430.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 65.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021859167609363794, "kl": 8.884072303771973e-05, "learning_rate": 2.155e-06, "loss": 0.0, "num_tokens": 1049642.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 65.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.002480959054082632, "kl": 0.0033598914742469788, "learning_rate": 2.1546666666666665e-06, "loss": 0.0002, "num_tokens": 1049878.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 65.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.040756918489933014, "kl": 0.016550449654459953, "learning_rate": 2.1543333333333337e-06, "loss": 0.0008, "num_tokens": 1050170.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.048881612718105316, "kl": 0.006315226026345044, "learning_rate": 2.154e-06, "loss": 0.0003, "num_tokens": 1050458.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.027717215940356255, "kl": 0.005355539731681347, "learning_rate": 2.153666666666667e-06, "loss": 0.0003, "num_tokens": 1050730.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 65.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.031059106811881065, "kl": 0.0015912905109871645, "learning_rate": 2.153333333333333e-06, "loss": 0.0001, "num_tokens": 1050949.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 65.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0925585925579071, "kl": 0.05042813625186682, "learning_rate": 2.153e-06, "loss": 0.0025, "num_tokens": 1051261.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 65.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.8912014961242676, "kl": 0.048889182100538164, "learning_rate": 2.1526666666666668e-06, "loss": 0.0174, "num_tokens": 1051574.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 65.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.8815157413482666, "kl": 0.05955894012004137, "learning_rate": 2.1523333333333335e-06, "loss": -0.0723, "num_tokens": 1051917.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 3544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 65.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07245248556137085, "kl": 0.004652391420677304, "learning_rate": 2.1520000000000003e-06, "loss": 0.0003, "num_tokens": 1052156.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 65.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 4.429896831512451, "kl": 0.13123369216918945, "learning_rate": 2.1516666666666667e-06, "loss": -0.0239, "num_tokens": 1052464.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 65.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07170914113521576, "kl": 0.018998458050191402, "learning_rate": 2.1513333333333335e-06, "loss": 0.001, "num_tokens": 1052808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 65.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003020386502612382, "kl": 6.897002458572388e-05, "learning_rate": 2.151e-06, "loss": 0.0, "num_tokens": 1053028.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 65.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.32641804218292236, "kl": 0.06980939954519272, "learning_rate": 2.1506666666666666e-06, "loss": 0.0033, "num_tokens": 1053340.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 65.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.06636328995227814, "kl": 0.019938739016652107, "learning_rate": 2.1503333333333334e-06, "loss": 0.001, "num_tokens": 1053660.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 65.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 3.9884839057922363, "kl": 0.04400889610406011, "learning_rate": 2.15e-06, "loss": 0.0032, "num_tokens": 1053985.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 3551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.019705817103385925, "kl": 0.006303793750703335, "learning_rate": 2.1496666666666665e-06, "loss": 0.0003, "num_tokens": 1054273.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 65.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.022153222933411598, "kl": 0.0006537400186061859, "learning_rate": 2.1493333333333333e-06, "loss": 0.0, "num_tokens": 1054533.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 65.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.14532485604286194, "kl": 0.02496556844562292, "learning_rate": 2.149e-06, "loss": 0.001, "num_tokens": 1054799.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 65.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.060010410845279694, "kl": 0.0015776307554915547, "learning_rate": 2.148666666666667e-06, "loss": 0.0001, "num_tokens": 1055055.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.3620195388793945, "kl": 0.008782767690718174, "learning_rate": 2.1483333333333337e-06, "loss": 0.2029, "num_tokens": 1055395.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 65.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.013834363780915737, "kl": 0.0009517103608231992, "learning_rate": 2.148e-06, "loss": 0.0, "num_tokens": 1055655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.2045542448759079, "kl": 0.04649087227880955, "learning_rate": 2.147666666666667e-06, "loss": 0.0023, "num_tokens": 1055943.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.14134031534194946, "kl": 0.025466084945946932, "learning_rate": 2.147333333333333e-06, "loss": 0.0012, "num_tokens": 1056214.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 65.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.00889053288847208, "kl": 0.008961380459368229, "learning_rate": 2.147e-06, "loss": 0.0004, "num_tokens": 1056486.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 65.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.024751445278525352, "kl": 0.0030332550404637004, "learning_rate": 2.1466666666666667e-06, "loss": 0.0002, "num_tokens": 1056752.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00039961549919098616, "kl": 0.0012107896036468446, "learning_rate": 2.1463333333333335e-06, "loss": 0.0001, "num_tokens": 1057032.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 65.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.8616456389427185, "kl": 0.20754162967205048, "learning_rate": 2.1460000000000003e-06, "loss": 0.0104, "num_tokens": 1057400.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 66.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.12785805761814117, "kl": 0.010238808114081621, "learning_rate": 2.1456666666666666e-06, "loss": 0.0005, "num_tokens": 1057660.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 66.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.238522529602051, "kl": 0.09039102122187614, "learning_rate": 2.1453333333333334e-06, "loss": -0.088, "num_tokens": 1057990.0, "reward": 5.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.674234628677368, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009274467825889587, "kl": 0.008835397195070982, "learning_rate": 2.145e-06, "loss": 0.0004, "num_tokens": 1058262.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 66.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.4824423789978027, "kl": 0.03818679414689541, "learning_rate": 2.1446666666666666e-06, "loss": 0.0215, "num_tokens": 1058611.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 3567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 66.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.1615619659423828, "kl": 0.02284571062773466, "learning_rate": 2.1443333333333334e-06, "loss": 0.0012, "num_tokens": 1058875.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 66.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.20201905071735382, "kl": 0.056334005668759346, "learning_rate": 2.144e-06, "loss": 0.0027, "num_tokens": 1059214.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 66.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.11449134349822998, "kl": 0.015466476790606976, "learning_rate": 2.143666666666667e-06, "loss": 0.0005, "num_tokens": 1059468.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 66.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03221873193979263, "kl": 0.14802279323339462, "learning_rate": 2.1433333333333333e-06, "loss": 0.0074, "num_tokens": 1059780.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 66.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.887183666229248, "kl": 0.04516427032649517, "learning_rate": 2.143e-06, "loss": -0.0965, "num_tokens": 1060140.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 3572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 42.75, "completions/mean_terminated_length": 42.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 66.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.156033754348755, "kl": 0.13535203784704208, "learning_rate": 2.142666666666667e-06, "loss": -0.1149, "num_tokens": 1060531.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 3573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 66.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.02052474021911621, "kl": 0.004775805864483118, "learning_rate": 2.1423333333333336e-06, "loss": 0.0002, "num_tokens": 1060803.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 66.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.00621081842109561, "kl": 0.0018688691779971123, "learning_rate": 2.142e-06, "loss": 0.0001, "num_tokens": 1061115.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 66.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0538310632109642, "kl": 0.09351903200149536, "learning_rate": 2.1416666666666668e-06, "loss": 0.0048, "num_tokens": 1061483.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 66.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.037144873291254044, "kl": 0.007186120608821511, "learning_rate": 2.141333333333333e-06, "loss": 0.0004, "num_tokens": 1061772.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 66.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.13158610463142395, "kl": 0.014042772352695465, "learning_rate": 2.141e-06, "loss": 0.0007, "num_tokens": 1062031.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 66.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042854901403188705, "kl": 0.00015013217489467934, "learning_rate": 2.1406666666666667e-06, "loss": 0.0, "num_tokens": 1062251.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 66.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 2.590139150619507, "kl": 0.17878342419862747, "learning_rate": 2.1403333333333335e-06, "loss": 0.0085, "num_tokens": 1062557.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 66.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 1.7620913982391357, "kl": 0.03124841209501028, "learning_rate": 2.1400000000000003e-06, "loss": 0.09, "num_tokens": 1062982.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 3581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 66.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04130195826292038, "kl": 0.005478741135448217, "learning_rate": 2.1396666666666666e-06, "loss": 0.0003, "num_tokens": 1063250.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 66.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.2970640659332275, "kl": 0.01648002862930298, "learning_rate": 2.1393333333333334e-06, "loss": 0.0014, "num_tokens": 1063538.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 66.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04830180108547211, "kl": 0.00431014085188508, "learning_rate": 2.1389999999999998e-06, "loss": 0.0002, "num_tokens": 1063850.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 66.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.011130349710583687, "kl": 0.0005721114575862885, "learning_rate": 2.138666666666667e-06, "loss": 0.0, "num_tokens": 1064110.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 66.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00029078495572321117, "kl": 7.481127977371216e-05, "learning_rate": 2.1383333333333333e-06, "loss": 0.0, "num_tokens": 1064330.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 66.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 10.302885055541992, "kl": 1.9815361201763153, "learning_rate": 2.138e-06, "loss": 0.0318, "num_tokens": 1064628.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 66.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.008370169438421726, "kl": 0.00010447204113006592, "learning_rate": 2.137666666666667e-06, "loss": 0.0, "num_tokens": 1064840.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 66.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08436626195907593, "kl": 0.029362904839217663, "learning_rate": 2.1373333333333333e-06, "loss": 0.0015, "num_tokens": 1065201.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 66.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0347416065633297, "kl": 0.0031578628113493323, "learning_rate": 2.137e-06, "loss": 0.0002, "num_tokens": 1065471.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 66.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.3766398429870605, "kl": 0.04959502071142197, "learning_rate": 2.136666666666667e-06, "loss": 0.0242, "num_tokens": 1065805.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 66.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010061761364340782, "kl": 0.00023939609673107043, "learning_rate": 2.1363333333333336e-06, "loss": 0.0, "num_tokens": 1066061.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05835021659731865, "kl": 0.003006845712661743, "learning_rate": 2.136e-06, "loss": 0.0001, "num_tokens": 1066328.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 66.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.032158125191926956, "kl": 0.0005912482738494873, "learning_rate": 2.1356666666666667e-06, "loss": 0.0, "num_tokens": 1066536.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 66.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09703069925308228, "kl": 0.005293893162161112, "learning_rate": 2.135333333333333e-06, "loss": 0.0003, "num_tokens": 1066784.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 66.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06441410630941391, "kl": 0.013853597454726696, "learning_rate": 2.135e-06, "loss": 0.0007, "num_tokens": 1067111.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 66.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02377534843981266, "kl": 0.005178789375349879, "learning_rate": 2.1346666666666667e-06, "loss": 0.0003, "num_tokens": 1067389.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 66.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.039628561586141586, "kl": 0.008012112695723772, "learning_rate": 2.1343333333333335e-06, "loss": 0.0004, "num_tokens": 1067695.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 66.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.035798411816358566, "kl": 0.001465982524678111, "learning_rate": 2.1340000000000002e-06, "loss": 0.0001, "num_tokens": 1067957.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.04002681002020836, "kl": 0.0018101908499374986, "learning_rate": 2.1336666666666666e-06, "loss": 0.0001, "num_tokens": 1068241.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 66.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 1.8050531148910522, "kl": 0.00274230184732005, "learning_rate": 2.1333333333333334e-06, "loss": 0.0297, "num_tokens": 1068559.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 3601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 66.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.1238691657781601, "kl": 0.042561380192637444, "learning_rate": 2.133e-06, "loss": 0.0021, "num_tokens": 1068859.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 66.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007309718639589846, "kl": 0.0012424737215042114, "learning_rate": 2.132666666666667e-06, "loss": 0.0001, "num_tokens": 1069139.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0442817322909832, "kl": 0.009598115226253867, "learning_rate": 2.1323333333333333e-06, "loss": 0.0005, "num_tokens": 1069423.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 66.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 5.423217296600342, "kl": 0.2912338078022003, "learning_rate": 2.132e-06, "loss": -0.0729, "num_tokens": 1069693.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 66.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06318531185388565, "kl": 0.003326006233692169, "learning_rate": 2.131666666666667e-06, "loss": 0.0002, "num_tokens": 1069909.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 66.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.057171136140823364, "kl": 0.028508439660072327, "learning_rate": 2.1313333333333332e-06, "loss": 0.0014, "num_tokens": 1070246.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 66.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.021175920963287354, "kl": 0.009201560635119677, "learning_rate": 2.131e-06, "loss": 0.0005, "num_tokens": 1070548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 66.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.014865121804177761, "kl": 0.00020401179790496826, "learning_rate": 2.130666666666667e-06, "loss": 0.0, "num_tokens": 1070760.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 66.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005663186893798411, "kl": 7.785111847624648e-06, "learning_rate": 2.1303333333333336e-06, "loss": 0.0, "num_tokens": 1071030.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 66.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.002381725935265422, "kl": 0.003385581076145172, "learning_rate": 2.13e-06, "loss": 0.0002, "num_tokens": 1071266.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 66.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.923074245452881, "kl": 0.21537644416093826, "learning_rate": 2.1296666666666667e-06, "loss": 0.0855, "num_tokens": 1071628.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 66.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.13695292174816132, "kl": 0.025151774752885103, "learning_rate": 2.129333333333333e-06, "loss": 0.0013, "num_tokens": 1071920.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 66.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.013088555075228214, "kl": 0.0007829467358533293, "learning_rate": 2.129e-06, "loss": 0.0, "num_tokens": 1072155.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 66.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.027377335354685783, "kl": 0.004256198415532708, "learning_rate": 2.128666666666667e-06, "loss": 0.0002, "num_tokens": 1072477.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 66.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.23087462782859802, "kl": 0.01953520847018808, "learning_rate": 2.1283333333333334e-06, "loss": 0.001, "num_tokens": 1072775.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.15868648886680603, "kl": 0.026405527256429195, "learning_rate": 2.128e-06, "loss": 0.0013, "num_tokens": 1073061.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 67.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.12318392843008041, "kl": 0.014631701167672873, "learning_rate": 2.1276666666666666e-06, "loss": 0.0007, "num_tokens": 1073397.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 67.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023509350139647722, "kl": 0.0033880844712257385, "learning_rate": 2.1273333333333334e-06, "loss": 0.0002, "num_tokens": 1073633.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 67.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.013437964022159576, "kl": 0.00028736889362335205, "learning_rate": 2.127e-06, "loss": 0.0, "num_tokens": 1073845.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 67.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.006805545650422573, "kl": 0.0007283776940312237, "learning_rate": 2.126666666666667e-06, "loss": 0.0, "num_tokens": 1074105.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007575757801532745, "clip_ratio/low_min": 0.007575757801532745, "clip_ratio/region_mean": 0.007575757801532745, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 67.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.9388680458068848, "kl": 0.2690776288509369, "learning_rate": 2.1263333333333333e-06, "loss": -0.0365, "num_tokens": 1074468.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 3622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 67.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.045186080038547516, "kl": 0.0023676478303968906, "learning_rate": 2.126e-06, "loss": 0.0001, "num_tokens": 1074735.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 67.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.7333438396453857, "kl": 0.16704384982585907, "learning_rate": 2.125666666666667e-06, "loss": 0.04, "num_tokens": 1075071.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 67.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.6389694213867188, "kl": 0.05160256661474705, "learning_rate": 2.125333333333333e-06, "loss": -0.1682, "num_tokens": 1075422.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 67.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.014252202585339546, "kl": 0.0005327500402927399, "learning_rate": 2.125e-06, "loss": 0.0, "num_tokens": 1075682.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 67.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0724712610244751, "kl": 0.019095337949693203, "learning_rate": 2.1246666666666668e-06, "loss": 0.001, "num_tokens": 1076022.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 67.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.11407589167356491, "kl": 0.03594349976629019, "learning_rate": 2.1243333333333335e-06, "loss": 0.0018, "num_tokens": 1076317.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 67.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06395824998617172, "kl": 0.002608716531540267, "learning_rate": 2.124e-06, "loss": 0.0001, "num_tokens": 1076573.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 67.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.005554386414587498, "kl": 0.0009240607614628971, "learning_rate": 2.1236666666666667e-06, "loss": 0.0, "num_tokens": 1076885.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 67.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.023439912125468254, "kl": 0.0022011324763298035, "learning_rate": 2.123333333333333e-06, "loss": 0.0001, "num_tokens": 1077101.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 67.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.017596596851944923, "kl": 0.0027563442708924413, "learning_rate": 2.1230000000000003e-06, "loss": 0.0001, "num_tokens": 1077369.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 67.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.24695774912834167, "kl": 0.010047714225947857, "learning_rate": 2.122666666666667e-06, "loss": 0.0006, "num_tokens": 1077616.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 67.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.04528961703181267, "kl": 0.007282962556928396, "learning_rate": 2.1223333333333334e-06, "loss": 0.0004, "num_tokens": 1077938.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 67.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 1.12740957736969, "kl": 0.35478825867176056, "learning_rate": 2.122e-06, "loss": -0.0177, "num_tokens": 1078306.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 67.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.009686710312962532, "kl": 0.01479168375954032, "learning_rate": 2.1216666666666665e-06, "loss": 0.0007, "num_tokens": 1078566.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 67.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1068182960152626, "kl": 0.02172045409679413, "learning_rate": 2.1213333333333333e-06, "loss": 0.0011, "num_tokens": 1078860.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 67.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03950683772563934, "kl": 0.013203125447034836, "learning_rate": 2.121e-06, "loss": 0.0007, "num_tokens": 1079274.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 67.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.11037489771842957, "kl": 0.05069075897336006, "learning_rate": 2.120666666666667e-06, "loss": 0.0024, "num_tokens": 1079606.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 67.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07049866020679474, "kl": 0.005110967089422047, "learning_rate": 2.1203333333333332e-06, "loss": 0.0003, "num_tokens": 1079866.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 67.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.12705372273921967, "kl": 0.04665055498480797, "learning_rate": 2.12e-06, "loss": 0.0024, "num_tokens": 1080224.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 67.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.00845647044479847, "kl": 0.0003898218274116516, "learning_rate": 2.119666666666667e-06, "loss": 0.0, "num_tokens": 1080432.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 67.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002862463006749749, "kl": 7.816404104232788e-05, "learning_rate": 2.119333333333333e-06, "loss": 0.0, "num_tokens": 1080652.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 67.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04544331878423691, "kl": 0.006195220164954662, "learning_rate": 2.119e-06, "loss": 0.0003, "num_tokens": 1080938.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 67.5, "frac_reward_zero_std": 0.0, "grad_norm": 5.987448692321777, "kl": 0.17379155382514, "learning_rate": 2.1186666666666667e-06, "loss": 0.0829, "num_tokens": 1081272.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 67.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09284070134162903, "kl": 0.015048619359731674, "learning_rate": 2.1183333333333335e-06, "loss": 0.0007, "num_tokens": 1081600.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 67.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04682357981801033, "kl": 0.006960212951526046, "learning_rate": 2.118e-06, "loss": 0.0003, "num_tokens": 1081872.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 67.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0363360233604908, "kl": 0.004977188538759947, "learning_rate": 2.1176666666666667e-06, "loss": 0.0002, "num_tokens": 1082144.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 67.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.009375547990202904, "kl": 0.2671874761581421, "learning_rate": 2.117333333333333e-06, "loss": 0.0134, "num_tokens": 1082448.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 67.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04979020357131958, "kl": 0.005073887296020985, "learning_rate": 2.1170000000000002e-06, "loss": 0.0002, "num_tokens": 1082742.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 67.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.49454399943351746, "kl": 0.09017374366521835, "learning_rate": 2.116666666666667e-06, "loss": 0.0046, "num_tokens": 1083092.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 67.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.008618989959359169, "kl": 0.0005549837951548398, "learning_rate": 2.1163333333333334e-06, "loss": 0.0, "num_tokens": 1083327.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 67.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1355447769165039, "kl": 0.02079087868332863, "learning_rate": 2.116e-06, "loss": 0.0009, "num_tokens": 1083622.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 67.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.025502989068627357, "kl": 0.0020838241325691342, "learning_rate": 2.1156666666666665e-06, "loss": 0.0001, "num_tokens": 1083904.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 67.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 6.5850510597229, "kl": 0.3873444255441427, "learning_rate": 2.1153333333333333e-06, "loss": 0.0923, "num_tokens": 1084186.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 67.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042832233011722565, "kl": 0.00015462636656593531, "learning_rate": 2.115e-06, "loss": 0.0, "num_tokens": 1084406.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 67.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.11391454190015793, "kl": 0.055850550532341, "learning_rate": 2.114666666666667e-06, "loss": 0.0026, "num_tokens": 1084717.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 67.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.14850559830665588, "kl": 0.013492303434759378, "learning_rate": 2.1143333333333332e-06, "loss": 0.0006, "num_tokens": 1085019.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 67.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.09155723452568054, "kl": 0.006057455786503851, "learning_rate": 2.114e-06, "loss": 0.0003, "num_tokens": 1085319.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 67.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.1932889074087143, "kl": 0.007851041154935956, "learning_rate": 2.1136666666666668e-06, "loss": 0.0004, "num_tokens": 1085573.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 67.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.0203304290771484, "kl": 0.0762731246650219, "learning_rate": 2.113333333333333e-06, "loss": 0.004, "num_tokens": 1085863.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 67.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.06873319298028946, "kl": 0.014164955355226994, "learning_rate": 2.1130000000000004e-06, "loss": 0.0007, "num_tokens": 1086151.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 67.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05913740023970604, "kl": 0.022879963740706444, "learning_rate": 2.1126666666666667e-06, "loss": 0.0011, "num_tokens": 1086504.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 67.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.191492557525635, "kl": 0.013359299551666481, "learning_rate": 2.1123333333333335e-06, "loss": 0.0315, "num_tokens": 1086778.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 67.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06391195952892303, "kl": 0.001900363015010953, "learning_rate": 2.112e-06, "loss": 0.0001, "num_tokens": 1086991.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 67.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.2260262966156006, "kl": 0.02756652794778347, "learning_rate": 2.1116666666666666e-06, "loss": 0.0012, "num_tokens": 1087303.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 3666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 67.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05605637654662132, "kl": 0.008502446697093546, "learning_rate": 2.1113333333333334e-06, "loss": 0.0004, "num_tokens": 1087591.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 67.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.23070630431175232, "kl": 0.18261893093585968, "learning_rate": 2.111e-06, "loss": 0.0092, "num_tokens": 1087901.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 67.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.1883108913898468, "kl": 0.08544060960412025, "learning_rate": 2.110666666666667e-06, "loss": 0.0042, "num_tokens": 1088306.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 67.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.015478103421628475, "kl": 0.004814976826310158, "learning_rate": 2.1103333333333333e-06, "loss": 0.0002, "num_tokens": 1088574.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 67.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03014935925602913, "kl": 0.00230532290879637, "learning_rate": 2.11e-06, "loss": 0.0001, "num_tokens": 1088892.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.010596226900815964, "kl": 0.008203256875276566, "learning_rate": 2.1096666666666665e-06, "loss": 0.0004, "num_tokens": 1089164.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 68.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.005889325402677059, "kl": 0.0004180723044555634, "learning_rate": 2.1093333333333333e-06, "loss": 0.0, "num_tokens": 1089399.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.897578716278076, "kl": 0.018367459997534752, "learning_rate": 2.109e-06, "loss": 0.2064, "num_tokens": 1089690.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 68.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06088269129395485, "kl": 0.023177883587777615, "learning_rate": 2.108666666666667e-06, "loss": 0.0011, "num_tokens": 1090030.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 68.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 8.026470184326172, "kl": 0.04168402776122093, "learning_rate": 2.108333333333333e-06, "loss": 0.0905, "num_tokens": 1090308.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 68.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.11855748295783997, "kl": 0.009419201407581568, "learning_rate": 2.108e-06, "loss": 0.0005, "num_tokens": 1090650.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 68.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.025909509509801865, "kl": 0.002303547109477222, "learning_rate": 2.1076666666666668e-06, "loss": 0.0001, "num_tokens": 1090927.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 68.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.025309741497039795, "kl": 0.0025870114332064986, "learning_rate": 2.1073333333333335e-06, "loss": 0.0001, "num_tokens": 1091241.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 68.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03963938355445862, "kl": 0.0064029518398456275, "learning_rate": 2.1070000000000003e-06, "loss": 0.0003, "num_tokens": 1091537.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 68.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.17038275301456451, "kl": 0.00915285013616085, "learning_rate": 2.1066666666666667e-06, "loss": 0.0005, "num_tokens": 1091794.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 68.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068563055247068405, "kl": 0.00044394657015800476, "learning_rate": 2.1063333333333335e-06, "loss": 0.0, "num_tokens": 1092054.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 68.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03563537821173668, "kl": 0.005906453588977456, "learning_rate": 2.106e-06, "loss": 0.0003, "num_tokens": 1092328.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 68.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0603560172021389, "kl": 0.00466797745320946, "learning_rate": 2.1056666666666666e-06, "loss": 0.0002, "num_tokens": 1092596.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.00028345666942186654, "kl": 7.80150294303894e-05, "learning_rate": 2.1053333333333334e-06, "loss": 0.0, "num_tokens": 1092816.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 68.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04415438324213028, "kl": 0.0013375446433201432, "learning_rate": 2.105e-06, "loss": 0.0001, "num_tokens": 1093084.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 68.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.010908227413892746, "kl": 0.26693589985370636, "learning_rate": 2.104666666666667e-06, "loss": 0.0133, "num_tokens": 1093388.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 68.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.08371180295944214, "kl": 0.02724790945649147, "learning_rate": 2.1043333333333333e-06, "loss": 0.0014, "num_tokens": 1093753.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 68.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.4664881229400635, "kl": 0.02393741998821497, "learning_rate": 2.104e-06, "loss": -0.1478, "num_tokens": 1094143.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 68.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03195560351014137, "kl": 0.006068086950108409, "learning_rate": 2.1036666666666665e-06, "loss": 0.0003, "num_tokens": 1094434.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 68.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.525040626525879, "kl": 0.06875466764904559, "learning_rate": 2.1033333333333332e-06, "loss": 0.1545, "num_tokens": 1094711.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 68.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.3200722932815552, "kl": 0.04628994641825557, "learning_rate": 2.103e-06, "loss": 0.0022, "num_tokens": 1094980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 68.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.2970018982887268, "kl": 0.040442612022161484, "learning_rate": 2.102666666666667e-06, "loss": 0.002, "num_tokens": 1095303.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 68.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02367353066802025, "kl": 0.0021707614650949836, "learning_rate": 2.1023333333333336e-06, "loss": 0.0001, "num_tokens": 1095623.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 68.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06780219078063965, "kl": 0.0032763570779934525, "learning_rate": 2.102e-06, "loss": 0.0002, "num_tokens": 1095883.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.050598856061697006, "kl": 0.008046301547437906, "learning_rate": 2.1016666666666667e-06, "loss": 0.0004, "num_tokens": 1096172.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 68.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.7576327323913574, "kl": 0.062360092997550964, "learning_rate": 2.1013333333333335e-06, "loss": -0.0329, "num_tokens": 1096546.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 3697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 68.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0103458222001791, "kl": 0.014693498611450195, "learning_rate": 2.1010000000000003e-06, "loss": 0.0007, "num_tokens": 1096806.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.013172569684684277, "kl": 0.00020606070756912231, "learning_rate": 2.1006666666666667e-06, "loss": 0.0, "num_tokens": 1097018.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 68.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09893094748258591, "kl": 0.015347694512456656, "learning_rate": 2.1003333333333334e-06, "loss": 0.0008, "num_tokens": 1097306.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 68.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.2724671959877014, "kl": 0.02285251021385193, "learning_rate": 2.1e-06, "loss": 0.0011, "num_tokens": 1097550.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 68.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.11986280232667923, "kl": 0.06006912142038345, "learning_rate": 2.0996666666666666e-06, "loss": 0.003, "num_tokens": 1097955.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 68.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.031555116176605225, "kl": 0.15293735265731812, "learning_rate": 2.0993333333333334e-06, "loss": 0.0076, "num_tokens": 1098265.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 68.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.056920044124126434, "kl": 0.02557973563671112, "learning_rate": 2.099e-06, "loss": 0.0013, "num_tokens": 1098620.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.18703508377075195, "kl": 0.0259105428121984, "learning_rate": 2.098666666666667e-06, "loss": 0.0013, "num_tokens": 1098902.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.12755346298217773, "kl": 0.009138700319454074, "learning_rate": 2.0983333333333333e-06, "loss": 0.0006, "num_tokens": 1099147.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 68.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10238856077194214, "kl": 0.035535553470253944, "learning_rate": 2.098e-06, "loss": 0.0018, "num_tokens": 1099463.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 68.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.027655061334371567, "kl": 0.006079294253140688, "learning_rate": 2.0976666666666664e-06, "loss": 0.0003, "num_tokens": 1099767.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 68.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.004385136999189854, "kl": 0.00014868975267745554, "learning_rate": 2.0973333333333336e-06, "loss": 0.0, "num_tokens": 1099987.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 68.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.08086896687746048, "kl": 0.01650427095592022, "learning_rate": 2.097e-06, "loss": 0.0008, "num_tokens": 1100293.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 68.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.005045006982982159, "kl": 0.0011046454310417175, "learning_rate": 2.0966666666666668e-06, "loss": 0.0001, "num_tokens": 1100605.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 68.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.014279398135840893, "kl": 0.00030957162380218506, "learning_rate": 2.0963333333333336e-06, "loss": 0.0, "num_tokens": 1100817.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 68.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.09553996473550797, "kl": 0.02474972326308489, "learning_rate": 2.096e-06, "loss": 0.0012, "num_tokens": 1101162.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 68.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.2830212116241455, "kl": 0.05193134769797325, "learning_rate": 2.0956666666666667e-06, "loss": 0.1303, "num_tokens": 1101511.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 2.2090346813201904, "kl": 0.09855070896446705, "learning_rate": 2.0953333333333335e-06, "loss": 0.0045, "num_tokens": 1101801.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 68.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.8406456112861633, "kl": 0.07098456658422947, "learning_rate": 2.0950000000000003e-06, "loss": 0.0041, "num_tokens": 1102081.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.002291263546794653, "kl": 0.0033948197960853577, "learning_rate": 2.0946666666666666e-06, "loss": 0.0002, "num_tokens": 1102317.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 68.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.32667556405067444, "kl": 0.07127051055431366, "learning_rate": 2.0943333333333334e-06, "loss": 0.0035, "num_tokens": 1102658.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09607010334730148, "kl": 0.011841509491205215, "learning_rate": 2.0939999999999998e-06, "loss": 0.0006, "num_tokens": 1102960.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 68.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0636925920844078, "kl": 0.042949215858243406, "learning_rate": 2.0936666666666666e-06, "loss": 0.0022, "num_tokens": 1103326.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.013012955896556377, "kl": 0.00017833709716796875, "learning_rate": 2.0933333333333333e-06, "loss": 0.0, "num_tokens": 1103538.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.011885164305567741, "kl": 0.002513908431865275, "learning_rate": 2.093e-06, "loss": 0.0001, "num_tokens": 1103822.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 68.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.811704158782959, "kl": 0.0468473955988884, "learning_rate": 2.092666666666667e-06, "loss": 0.0902, "num_tokens": 1104134.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 68.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01888645812869072, "kl": 0.0030528414936270565, "learning_rate": 2.0923333333333333e-06, "loss": 0.0001, "num_tokens": 1104392.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 68.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.24757111072540283, "kl": 0.026431400794535875, "learning_rate": 2.092e-06, "loss": 0.0014, "num_tokens": 1104668.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 69.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08265064656734467, "kl": 0.016250974498689175, "learning_rate": 2.0916666666666664e-06, "loss": 0.0008, "num_tokens": 1104988.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 69.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.019190331920981407, "kl": 0.00369901186786592, "learning_rate": 2.0913333333333336e-06, "loss": 0.0002, "num_tokens": 1105260.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 69.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08026011288166046, "kl": 0.004230632446706295, "learning_rate": 2.091e-06, "loss": 0.0002, "num_tokens": 1105523.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 69.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.13460038602352142, "kl": 0.020469567651161924, "learning_rate": 2.0906666666666668e-06, "loss": 0.0011, "num_tokens": 1105829.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.14596623182296753, "kl": 0.029725193977355957, "learning_rate": 2.0903333333333335e-06, "loss": 0.0015, "num_tokens": 1106101.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 69.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.12871718406677246, "kl": 0.06440968252718449, "learning_rate": 2.09e-06, "loss": 0.0032, "num_tokens": 1106437.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 69.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.009899850934743881, "kl": 0.008623609319329262, "learning_rate": 2.0896666666666667e-06, "loss": 0.0004, "num_tokens": 1106709.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 69.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0689016580581665, "kl": 0.04338335618376732, "learning_rate": 2.0893333333333335e-06, "loss": 0.0022, "num_tokens": 1107009.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 69.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.8067128658294678, "kl": 0.023298457264900208, "learning_rate": 2.0890000000000002e-06, "loss": -0.0239, "num_tokens": 1107360.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 42.75, "completions/mean_terminated_length": 42.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 69.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.16316451132297516, "kl": 0.043815051671117544, "learning_rate": 2.0886666666666666e-06, "loss": 0.0018, "num_tokens": 1107751.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08839473128318787, "kl": 0.010210568085312843, "learning_rate": 2.0883333333333334e-06, "loss": 0.0005, "num_tokens": 1108040.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 69.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.09216146916151047, "kl": 0.13382838666439056, "learning_rate": 2.0879999999999997e-06, "loss": 0.0064, "num_tokens": 1108365.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 69.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07643798738718033, "kl": 0.0196725451387465, "learning_rate": 2.0876666666666665e-06, "loss": 0.001, "num_tokens": 1108700.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 69.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.139026165008545, "kl": 0.032755774445831776, "learning_rate": 2.0873333333333337e-06, "loss": -0.0303, "num_tokens": 1108988.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 69.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.019622936844825745, "kl": 0.0032946651335805655, "learning_rate": 2.087e-06, "loss": 0.0002, "num_tokens": 1109272.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 69.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04078260809183121, "kl": 0.004805737407878041, "learning_rate": 2.086666666666667e-06, "loss": 0.0002, "num_tokens": 1109594.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 69.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.15685616433620453, "kl": 0.00503178930375725, "learning_rate": 2.0863333333333332e-06, "loss": 0.0003, "num_tokens": 1109818.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 69.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.15890878438949585, "kl": 0.019384440034627914, "learning_rate": 2.086e-06, "loss": 0.001, "num_tokens": 1110078.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 69.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.012090597301721573, "kl": 0.0009440481662750244, "learning_rate": 2.085666666666667e-06, "loss": 0.0, "num_tokens": 1110288.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 69.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04344664514064789, "kl": 0.009759694337844849, "learning_rate": 2.0853333333333336e-06, "loss": 0.0005, "num_tokens": 1110572.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 69.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0350305549800396, "kl": 0.011384228244423866, "learning_rate": 2.085e-06, "loss": 0.0006, "num_tokens": 1110833.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 69.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.895107626914978, "kl": 0.2670859545469284, "learning_rate": 2.0846666666666667e-06, "loss": 0.0134, "num_tokens": 1111205.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01900782249867916, "kl": 0.0014404583489522338, "learning_rate": 2.0843333333333335e-06, "loss": 0.0001, "num_tokens": 1111524.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 69.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024961652234196663, "kl": 0.003362610936164856, "learning_rate": 2.084e-06, "loss": 0.0002, "num_tokens": 1111760.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 69.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.008233225904405117, "kl": 0.001639210619032383, "learning_rate": 2.0836666666666667e-06, "loss": 0.0001, "num_tokens": 1112072.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 69.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.388033390045166, "kl": 0.02213809033855796, "learning_rate": 2.0833333333333334e-06, "loss": 0.053, "num_tokens": 1112374.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 69.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 2.393929958343506, "kl": 0.1466339000617154, "learning_rate": 2.0830000000000002e-06, "loss": 0.0076, "num_tokens": 1112642.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 69.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04682360589504242, "kl": 0.04479285143315792, "learning_rate": 2.0826666666666666e-06, "loss": 0.0022, "num_tokens": 1113054.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 69.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.27522349357605, "kl": 0.1270275004208088, "learning_rate": 2.0823333333333334e-06, "loss": -0.0048, "num_tokens": 1113421.0, "reward": 5.625, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 2.462214469909668, "step": 3754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 69.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.006829011719673872, "kl": 5.1021575927734375e-05, "learning_rate": 2.0819999999999997e-06, "loss": 0.0, "num_tokens": 1113633.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 69.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.14577077329158783, "kl": 0.0053781368769705296, "learning_rate": 2.081666666666667e-06, "loss": 0.0004, "num_tokens": 1113873.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 69.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09836200624704361, "kl": 0.016962699592113495, "learning_rate": 2.0813333333333337e-06, "loss": 0.0008, "num_tokens": 1114192.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 71.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 71.75, "completions/mean_terminated_length": 10.333333969116211, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 69.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.552198886871338, "kl": 0.35889815050177276, "learning_rate": 2.081e-06, "loss": 0.4608, "num_tokens": 1114703.0, "reward": 4.925000190734863, "reward_std": 3.797696828842163, "rewards/reward_combined/mean": 4.925000190734863, "rewards/reward_combined/std": 3.797696828842163, "step": 3758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 69.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004982537589967251, "kl": 0.001121779263485223, "learning_rate": 2.080666666666667e-06, "loss": 0.0001, "num_tokens": 1115015.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04316097870469093, "kl": 0.007986365118995309, "learning_rate": 2.080333333333333e-06, "loss": 0.0004, "num_tokens": 1115306.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.014881937764585018, "kl": 0.0033111422089859843, "learning_rate": 2.08e-06, "loss": 0.0002, "num_tokens": 1115586.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 69.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.030525509268045425, "kl": 0.0018466348992660642, "learning_rate": 2.0796666666666668e-06, "loss": 0.0001, "num_tokens": 1115888.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 69.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.06862545013427734, "kl": 0.0027694710515788756, "learning_rate": 2.0793333333333336e-06, "loss": 0.0002, "num_tokens": 1116110.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 69.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.032411668449640274, "kl": 0.0027646422386169434, "learning_rate": 2.079e-06, "loss": 0.0001, "num_tokens": 1116370.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 69.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0300470981746912, "kl": 0.0009364724101033062, "learning_rate": 2.0786666666666667e-06, "loss": 0.0, "num_tokens": 1116626.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 69.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 4.757248878479004, "kl": 0.022038782015442848, "learning_rate": 2.0783333333333335e-06, "loss": 0.0977, "num_tokens": 1116899.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 69.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002891722833737731, "kl": 7.560849189758301e-05, "learning_rate": 2.078e-06, "loss": 0.0, "num_tokens": 1117119.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 69.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.009057600982487202, "kl": 0.2671954482793808, "learning_rate": 2.0776666666666666e-06, "loss": 0.0134, "num_tokens": 1117423.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 69.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.058537062257528305, "kl": 0.004185812082141638, "learning_rate": 2.0773333333333334e-06, "loss": 0.0002, "num_tokens": 1117671.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 69.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.008690671995282173, "kl": 0.0008343184599652886, "learning_rate": 2.077e-06, "loss": 0.0, "num_tokens": 1117943.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 69.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 6.12689733505249, "kl": 0.026604359038174152, "learning_rate": 2.0766666666666665e-06, "loss": 0.1347, "num_tokens": 1118256.0, "reward": 5.0, "reward_std": 5.0, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 5.0, "step": 3771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 60.5, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 60.5, "completions/mean_terminated_length": 60.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 69.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.4342854022979736, "kl": 0.031211985275149345, "learning_rate": 2.0763333333333333e-06, "loss": 0.3869, "num_tokens": 1118714.0, "reward": 2.799999952316284, "reward_std": 1.399999976158142, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 1.399999976158142, "step": 3772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 69.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.13445831835269928, "kl": 0.03361153043806553, "learning_rate": 2.0759999999999997e-06, "loss": 0.0017, "num_tokens": 1119029.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 69.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.035520054399967194, "kl": 0.015147535130381584, "learning_rate": 2.075666666666667e-06, "loss": 0.0008, "num_tokens": 1119403.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 69.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04940409958362579, "kl": 0.002904816879890859, "learning_rate": 2.0753333333333337e-06, "loss": 0.0001, "num_tokens": 1119729.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 69.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08960027247667313, "kl": 0.016207166947424412, "learning_rate": 2.075e-06, "loss": 0.0008, "num_tokens": 1120027.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 69.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.013842816464602947, "kl": 0.0006515667628264055, "learning_rate": 2.074666666666667e-06, "loss": 0.0, "num_tokens": 1120262.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.010114590637385845, "kl": 0.0011583297746255994, "learning_rate": 2.074333333333333e-06, "loss": 0.0001, "num_tokens": 1120544.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 69.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06705860048532486, "kl": 0.033969609066843987, "learning_rate": 2.074e-06, "loss": 0.0017, "num_tokens": 1120877.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 70.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.3373340666294098, "kl": 0.02420927892671898, "learning_rate": 2.0736666666666667e-06, "loss": 0.0011, "num_tokens": 1121137.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 70.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01912330836057663, "kl": 0.003732536919414997, "learning_rate": 2.0733333333333335e-06, "loss": 0.0002, "num_tokens": 1121415.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 70.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.1849440634250641, "kl": 0.030430767685174942, "learning_rate": 2.073e-06, "loss": 0.0015, "num_tokens": 1121714.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 70.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04467763379216194, "kl": 0.03808296099305153, "learning_rate": 2.0726666666666667e-06, "loss": 0.0019, "num_tokens": 1122119.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 70.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.17115505039691925, "kl": 0.016103115398436785, "learning_rate": 2.0723333333333335e-06, "loss": 0.0008, "num_tokens": 1122417.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 70.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.009636009112000465, "kl": 0.2670990973711014, "learning_rate": 2.072e-06, "loss": 0.0134, "num_tokens": 1122721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 70.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.894669532775879, "kl": 0.13417953625321388, "learning_rate": 2.071666666666667e-06, "loss": 0.0636, "num_tokens": 1123059.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 70.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03819984570145607, "kl": 0.005596211412921548, "learning_rate": 2.0713333333333334e-06, "loss": 0.0003, "num_tokens": 1123332.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 70.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00319188111461699, "kl": 0.0003730393946170807, "learning_rate": 2.071e-06, "loss": 0.0, "num_tokens": 1123592.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 70.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.10953184962272644, "kl": 0.01062619686126709, "learning_rate": 2.0706666666666665e-06, "loss": 0.0004, "num_tokens": 1123846.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 70.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.2885751724243164, "kl": 0.02905437909066677, "learning_rate": 2.0703333333333333e-06, "loss": 0.0015, "num_tokens": 1124104.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 70.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.09554505348205566, "kl": 0.00447315294877626, "learning_rate": 2.07e-06, "loss": 0.0002, "num_tokens": 1124361.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 70.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.011061636731028557, "kl": 0.01441561197862029, "learning_rate": 2.069666666666667e-06, "loss": 0.0007, "num_tokens": 1124621.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 70.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.4297924041748047, "kl": 0.038245189003646374, "learning_rate": 2.0693333333333337e-06, "loss": -0.0629, "num_tokens": 1124965.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 70.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.12185338884592056, "kl": 0.0289075025357306, "learning_rate": 2.069e-06, "loss": 0.0015, "num_tokens": 1125251.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 70.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.17212502658367157, "kl": 0.039578577503561974, "learning_rate": 2.068666666666667e-06, "loss": 0.002, "num_tokens": 1125612.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 70.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.02968708984553814, "kl": 0.0010114670731127262, "learning_rate": 2.068333333333333e-06, "loss": 0.0001, "num_tokens": 1125886.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 70.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.8477181792259216, "kl": 0.0689125619828701, "learning_rate": 2.068e-06, "loss": 0.0035, "num_tokens": 1126225.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 70.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.01068786345422268, "kl": 0.001261926256120205, "learning_rate": 2.0676666666666667e-06, "loss": 0.0001, "num_tokens": 1126543.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 70.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.17308853566646576, "kl": 0.01491808972787112, "learning_rate": 2.0673333333333335e-06, "loss": 0.0007, "num_tokens": 1126811.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 70.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.059425778687000275, "kl": 0.01658582268282771, "learning_rate": 2.067e-06, "loss": 0.0008, "num_tokens": 1127157.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 70.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05604563653469086, "kl": 0.007753998972475529, "learning_rate": 2.0666666666666666e-06, "loss": 0.0004, "num_tokens": 1127471.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 82.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 70.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.353825330734253, "kl": 0.046961236745119095, "learning_rate": 2.0663333333333334e-06, "loss": 0.4165, "num_tokens": 1128035.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 70.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 6.302901268005371, "kl": 0.003622759133577347, "learning_rate": 2.066e-06, "loss": 0.3528, "num_tokens": 1128271.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 70.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06839118897914886, "kl": 0.01825037319213152, "learning_rate": 2.065666666666667e-06, "loss": 0.0009, "num_tokens": 1128564.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 70.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.026509441435337067, "kl": 0.0017391294240951538, "learning_rate": 2.0653333333333334e-06, "loss": 0.0001, "num_tokens": 1128776.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 70.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.00028999053756706417, "kl": 7.423758506774902e-05, "learning_rate": 2.065e-06, "loss": 0.0, "num_tokens": 1128996.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 70.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.15088064968585968, "kl": 0.01860713306814432, "learning_rate": 2.0646666666666665e-06, "loss": 0.0009, "num_tokens": 1129261.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3807 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.014285714365541935, "clip_ratio/low_min": 0.014285714365541935, "clip_ratio/region_mean": 0.032142858020961285, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 70.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.893303632736206, "kl": 0.21711437217891216, "learning_rate": 2.0643333333333333e-06, "loss": 0.0234, "num_tokens": 1129548.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 70.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03259415924549103, "kl": 0.1569090113043785, "learning_rate": 2.064e-06, "loss": 0.0078, "num_tokens": 1129856.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 70.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.041647255420684814, "kl": 0.005442672874778509, "learning_rate": 2.063666666666667e-06, "loss": 0.0003, "num_tokens": 1130146.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 70.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03202512487769127, "kl": 0.0002814382314682007, "learning_rate": 2.0633333333333336e-06, "loss": 0.0, "num_tokens": 1130358.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 70.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0650842934846878, "kl": 0.011250387877225876, "learning_rate": 2.063e-06, "loss": 0.0006, "num_tokens": 1130626.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 70.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.09915147721767426, "kl": 0.10592405870556831, "learning_rate": 2.0626666666666668e-06, "loss": 0.0054, "num_tokens": 1130996.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 70.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06355968862771988, "kl": 0.008554394356906414, "learning_rate": 2.062333333333333e-06, "loss": 0.0004, "num_tokens": 1131325.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 70.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.30297115445137024, "kl": 0.029221629025414586, "learning_rate": 2.062e-06, "loss": 0.0015, "num_tokens": 1131636.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 70.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.00455054733902216, "kl": 0.0001924872340168804, "learning_rate": 2.0616666666666667e-06, "loss": 0.0, "num_tokens": 1131896.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 70.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.21917401254177094, "kl": 0.02348022977821529, "learning_rate": 2.0613333333333335e-06, "loss": 0.0011, "num_tokens": 1132178.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 70.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.6584258675575256, "kl": 0.11333750560879707, "learning_rate": 2.0610000000000003e-06, "loss": 0.006, "num_tokens": 1132482.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 70.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.976577281951904, "kl": 0.38584525883197784, "learning_rate": 2.0606666666666666e-06, "loss": 0.0175, "num_tokens": 1132804.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 70.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 3.068915605545044, "kl": 0.046415770426392555, "learning_rate": 2.0603333333333334e-06, "loss": -0.0044, "num_tokens": 1133141.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 70.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.9163060188293457, "kl": 0.006486825877800584, "learning_rate": 2.06e-06, "loss": 0.0128, "num_tokens": 1133478.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 70.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02635047771036625, "kl": 0.0011291744885966182, "learning_rate": 2.059666666666667e-06, "loss": 0.0001, "num_tokens": 1133713.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3822 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01785714365541935, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 70.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 4.870720863342285, "kl": 0.05522574670612812, "learning_rate": 2.0593333333333333e-06, "loss": -0.0024, "num_tokens": 1133983.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 70.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.09829191118478775, "kl": 0.0031094219521037303, "learning_rate": 2.059e-06, "loss": 0.0002, "num_tokens": 1134205.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 70.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.16233311593532562, "kl": 0.02425145683810115, "learning_rate": 2.0586666666666665e-06, "loss": 0.0013, "num_tokens": 1134532.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 70.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008066217415034771, "kl": 0.0011122801224701107, "learning_rate": 2.0583333333333332e-06, "loss": 0.0001, "num_tokens": 1134812.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 70.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08590252697467804, "kl": 0.002045929431915283, "learning_rate": 2.058e-06, "loss": 0.0001, "num_tokens": 1135016.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 70.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.009715806692838669, "kl": 0.008551953826099634, "learning_rate": 2.057666666666667e-06, "loss": 0.0004, "num_tokens": 1135288.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 70.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020628825295716524, "kl": 0.00343361496925354, "learning_rate": 2.0573333333333336e-06, "loss": 0.0002, "num_tokens": 1135524.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 70.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05215715244412422, "kl": 0.0027110169176012278, "learning_rate": 2.057e-06, "loss": 0.0001, "num_tokens": 1135824.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 70.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.003164144931361079, "kl": 0.00020235776901245117, "learning_rate": 2.0566666666666667e-06, "loss": 0.0, "num_tokens": 1136068.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 70.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07815029472112656, "kl": 0.00721925962716341, "learning_rate": 2.056333333333333e-06, "loss": 0.0004, "num_tokens": 1136358.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 70.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02345367521047592, "kl": 0.010234278626739979, "learning_rate": 2.0560000000000003e-06, "loss": 0.0006, "num_tokens": 1136741.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.048825547099113464, "kl": 0.007530066650360823, "learning_rate": 2.0556666666666667e-06, "loss": 0.0004, "num_tokens": 1137039.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003058707807213068, "kl": 6.46635890007019e-05, "learning_rate": 2.0553333333333334e-06, "loss": 0.0, "num_tokens": 1137259.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 71.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.2617404460906982, "kl": 0.03131976258009672, "learning_rate": 2.0550000000000002e-06, "loss": 0.0064, "num_tokens": 1137547.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.10208529233932495, "kl": 0.00822573306504637, "learning_rate": 2.0546666666666666e-06, "loss": 0.0004, "num_tokens": 1137843.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 71.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 7.862679481506348, "kl": 0.045759277418255806, "learning_rate": 2.0543333333333334e-06, "loss": 0.2109, "num_tokens": 1138128.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 3838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03401223570108414, "kl": 0.005740228341892362, "learning_rate": 2.054e-06, "loss": 0.0003, "num_tokens": 1138426.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 71.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04922451823949814, "kl": 0.004779013805091381, "learning_rate": 2.053666666666667e-06, "loss": 0.0002, "num_tokens": 1138722.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 71.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02833518199622631, "kl": 0.00251924991607666, "learning_rate": 2.0533333333333333e-06, "loss": 0.0001, "num_tokens": 1138934.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01930255815386772, "kl": 0.00031400471925735474, "learning_rate": 2.053e-06, "loss": 0.0, "num_tokens": 1139146.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 71.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.09901589155197144, "kl": 0.13444199413061142, "learning_rate": 2.0526666666666664e-06, "loss": 0.0067, "num_tokens": 1139518.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 71.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.008475382812321186, "kl": 0.0018011517822742462, "learning_rate": 2.0523333333333332e-06, "loss": 0.0001, "num_tokens": 1139830.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 71.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.20135091245174408, "kl": 0.035652560414746404, "learning_rate": 2.052e-06, "loss": 0.0018, "num_tokens": 1140098.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 71.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.197789192199707, "kl": 0.16001486778259277, "learning_rate": 2.0516666666666668e-06, "loss": -0.0722, "num_tokens": 1140464.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 3846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030362692195922136, "kl": 0.0003831550420727581, "learning_rate": 2.0513333333333336e-06, "loss": 0.0, "num_tokens": 1140683.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 71.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 3.256838083267212, "kl": 0.05211942456662655, "learning_rate": 2.051e-06, "loss": 0.04, "num_tokens": 1141092.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 3848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 71.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.016511255875229836, "kl": 0.001533987175207585, "learning_rate": 2.0506666666666667e-06, "loss": 0.0001, "num_tokens": 1141410.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 71.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.04304053634405136, "kl": 0.007785625057294965, "learning_rate": 2.050333333333333e-06, "loss": 0.0003, "num_tokens": 1141666.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 1.539617657661438, "kl": 0.03314585331827402, "learning_rate": 2.0500000000000003e-06, "loss": 0.4536, "num_tokens": 1142181.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 71.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.06793303787708282, "kl": 0.025253762491047382, "learning_rate": 2.0496666666666666e-06, "loss": 0.0013, "num_tokens": 1142474.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.908078670501709, "kl": 0.09509957768023014, "learning_rate": 2.0493333333333334e-06, "loss": 0.1365, "num_tokens": 1142769.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 71.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.013990159146487713, "kl": 0.013835089281201363, "learning_rate": 2.049e-06, "loss": 0.0007, "num_tokens": 1143029.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 71.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.029038339853286743, "kl": 0.001210396527312696, "learning_rate": 2.0486666666666666e-06, "loss": 0.0001, "num_tokens": 1143299.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.12904158234596252, "kl": 0.0114885657094419, "learning_rate": 2.0483333333333333e-06, "loss": 0.0006, "num_tokens": 1143567.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 71.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.977405309677124, "kl": 0.020577928982675076, "learning_rate": 2.048e-06, "loss": 0.0686, "num_tokens": 1143905.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 71.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008668515365570784, "kl": 0.0011933803907595575, "learning_rate": 2.047666666666667e-06, "loss": 0.0001, "num_tokens": 1144182.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 71.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05257728695869446, "kl": 0.008434228366240859, "learning_rate": 2.0473333333333333e-06, "loss": 0.0004, "num_tokens": 1144509.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 71.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.028920171782374382, "kl": 0.0005989249621052295, "learning_rate": 2.047e-06, "loss": 0.0, "num_tokens": 1144758.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 71.5, "frac_reward_zero_std": 1.0, "grad_norm": 1.2264093160629272, "kl": 0.18462391011416912, "learning_rate": 2.0466666666666664e-06, "loss": 0.0089, "num_tokens": 1145146.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08258006721735, "kl": 0.01812015101313591, "learning_rate": 2.046333333333333e-06, "loss": 0.001, "num_tokens": 1145435.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 71.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.256923198699951, "kl": 0.7270060628652573, "learning_rate": 2.0460000000000004e-06, "loss": 0.0602, "num_tokens": 1145740.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 71.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.1236493587493896, "kl": 0.22780824918299913, "learning_rate": 2.0456666666666668e-06, "loss": -0.0639, "num_tokens": 1146048.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 71.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.08658071607351303, "kl": 0.14387647807598114, "learning_rate": 2.0453333333333335e-06, "loss": 0.0071, "num_tokens": 1146365.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 71.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.050202954560518265, "kl": 0.005375309803639539, "learning_rate": 2.045e-06, "loss": 0.0003, "num_tokens": 1146637.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 71.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01984328404068947, "kl": 0.002279287320561707, "learning_rate": 2.0446666666666667e-06, "loss": 0.0001, "num_tokens": 1146946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 71.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.18695181608200073, "kl": 0.05382288992404938, "learning_rate": 2.0443333333333335e-06, "loss": 0.0026, "num_tokens": 1147225.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 71.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02230338379740715, "kl": 0.008178194984793663, "learning_rate": 2.0440000000000003e-06, "loss": 0.0004, "num_tokens": 1147493.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 88.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 88.25, "completions/mean_terminated_length": 32.333335876464844, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 71.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.9161524772644043, "kl": 0.033104510977864265, "learning_rate": 2.0436666666666666e-06, "loss": 0.3895, "num_tokens": 1148070.0, "reward": 5.300000190734863, "reward_std": 4.399999618530273, "rewards/reward_combined/mean": 5.300000190734863, "rewards/reward_combined/std": 4.400000095367432, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 71.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08402593433856964, "kl": 0.0031503804493695498, "learning_rate": 2.0433333333333334e-06, "loss": 0.0001, "num_tokens": 1148304.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.045012056827545166, "kl": 0.015415312722325325, "learning_rate": 2.043e-06, "loss": 0.0008, "num_tokens": 1148588.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 71.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06828045099973679, "kl": 0.0388933252543211, "learning_rate": 2.0426666666666665e-06, "loss": 0.0018, "num_tokens": 1148931.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 71.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 5.168416976928711, "kl": 0.3493591845035553, "learning_rate": 2.0423333333333333e-06, "loss": 0.0891, "num_tokens": 1149259.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 3874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 71.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.046541862189769745, "kl": 0.006941606290638447, "learning_rate": 2.042e-06, "loss": 0.0003, "num_tokens": 1149548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 71.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.799485445022583, "kl": 0.28889428451657295, "learning_rate": 2.041666666666667e-06, "loss": 0.02, "num_tokens": 1149887.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 3876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 71.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.953897476196289, "kl": 0.14878639951348305, "learning_rate": 2.0413333333333332e-06, "loss": 0.0078, "num_tokens": 1150193.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 3877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 71.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.6304640769958496, "kl": 0.047366587445139885, "learning_rate": 2.041e-06, "loss": -0.0338, "num_tokens": 1150540.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 3878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 71.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.037589848041534424, "kl": 0.003812772105447948, "learning_rate": 2.0406666666666664e-06, "loss": 0.0002, "num_tokens": 1150800.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009633529931306839, "kl": 0.004115089774131775, "learning_rate": 2.0403333333333336e-06, "loss": 0.0002, "num_tokens": 1151016.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 71.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.004128021188080311, "kl": 0.0003852955996990204, "learning_rate": 2.0400000000000004e-06, "loss": 0.0, "num_tokens": 1151276.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 71.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 6.983616352081299, "kl": 0.022872302681207657, "learning_rate": 2.0396666666666667e-06, "loss": -0.1267, "num_tokens": 1151594.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 8.120149612426758, "kl": 0.023815875872969627, "learning_rate": 2.0393333333333335e-06, "loss": 0.3868, "num_tokens": 1151857.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 3883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 71.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.11076006293296814, "kl": 0.0252806656062603, "learning_rate": 2.039e-06, "loss": 0.0013, "num_tokens": 1152200.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 71.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0407119020819664, "kl": 0.005279630655422807, "learning_rate": 2.0386666666666667e-06, "loss": 0.0003, "num_tokens": 1152472.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 71.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03676833212375641, "kl": 0.0010466799139976501, "learning_rate": 2.0383333333333334e-06, "loss": 0.0, "num_tokens": 1152682.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 71.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04379789158701897, "kl": 0.001031553721986711, "learning_rate": 2.0380000000000002e-06, "loss": 0.0001, "num_tokens": 1152938.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 72.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.7426738739013672, "kl": 0.02430872805416584, "learning_rate": 2.0376666666666666e-06, "loss": -0.0506, "num_tokens": 1153259.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 72.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010975892655551434, "kl": 0.0013699769624508917, "learning_rate": 2.0373333333333334e-06, "loss": 0.0001, "num_tokens": 1153579.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 72.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.18640156090259552, "kl": 0.030793381854891777, "learning_rate": 2.037e-06, "loss": 0.0015, "num_tokens": 1153899.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 72.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015834120567888021, "kl": 0.001036886496876832, "learning_rate": 2.0366666666666665e-06, "loss": 0.0001, "num_tokens": 1154195.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 72.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03269888833165169, "kl": 0.0013617835938930511, "learning_rate": 2.0363333333333333e-06, "loss": 0.0001, "num_tokens": 1154455.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.13884000480175018, "kl": 0.020147479372099042, "learning_rate": 2.036e-06, "loss": 0.0006, "num_tokens": 1154709.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 72.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.9379022121429443, "kl": 0.08437211811542511, "learning_rate": 2.035666666666667e-06, "loss": 0.0443, "num_tokens": 1155053.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018545180791988969, "kl": 0.003517255187034607, "learning_rate": 2.0353333333333332e-06, "loss": 0.0002, "num_tokens": 1155289.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 72.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.24216555058956146, "kl": 0.10458581149578094, "learning_rate": 2.035e-06, "loss": 0.0053, "num_tokens": 1155659.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 72.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.12269569933414459, "kl": 0.10996821895241737, "learning_rate": 2.0346666666666664e-06, "loss": 0.0054, "num_tokens": 1155969.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 72.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.02984246052801609, "kl": 0.006525772274471819, "learning_rate": 2.0343333333333336e-06, "loss": 0.0003, "num_tokens": 1156257.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 72.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 1.6496450901031494, "kl": 0.012791088549420238, "learning_rate": 2.0340000000000003e-06, "loss": -0.0001, "num_tokens": 1156557.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 72.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.6115683913230896, "kl": 0.08949461579322815, "learning_rate": 2.0336666666666667e-06, "loss": 0.0047, "num_tokens": 1156860.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 72.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.44598016142845154, "kl": 0.0961623266339302, "learning_rate": 2.0333333333333335e-06, "loss": 0.0049, "num_tokens": 1157214.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 72.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.1155848354101181, "kl": 0.010166168678551912, "learning_rate": 2.033e-06, "loss": 0.0005, "num_tokens": 1157510.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 72.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.019726382568478584, "kl": 0.041400933638215065, "learning_rate": 2.0326666666666666e-06, "loss": 0.0021, "num_tokens": 1157800.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 72.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.004721549805253744, "kl": 0.00033188238739967346, "learning_rate": 2.0323333333333334e-06, "loss": 0.0, "num_tokens": 1158044.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.75, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 72.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.119271993637085, "kl": 0.05787679785862565, "learning_rate": 2.032e-06, "loss": 0.0532, "num_tokens": 1158467.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 72.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.07601617276668549, "kl": 0.04474889859557152, "learning_rate": 2.0316666666666666e-06, "loss": 0.002, "num_tokens": 1158813.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009916847571730614, "kl": 0.004140004515647888, "learning_rate": 2.0313333333333333e-06, "loss": 0.0002, "num_tokens": 1159029.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 72.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.4983761310577393, "kl": 0.08018971979618073, "learning_rate": 2.031e-06, "loss": -0.1516, "num_tokens": 1159370.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 3908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 72.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.6229984760284424, "kl": 0.27881201915442944, "learning_rate": 2.0306666666666665e-06, "loss": -0.051, "num_tokens": 1159669.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 72.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.23746268451213837, "kl": 0.05401692911982536, "learning_rate": 2.0303333333333337e-06, "loss": 0.0027, "num_tokens": 1159965.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 72.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05387220159173012, "kl": 0.005998404783895239, "learning_rate": 2.03e-06, "loss": 0.0003, "num_tokens": 1160237.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 72.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.009281245060265064, "kl": 0.04096512123942375, "learning_rate": 2.029666666666667e-06, "loss": 0.002, "num_tokens": 1160642.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 72.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006973665440455079, "kl": 0.0011710290564224124, "learning_rate": 2.029333333333333e-06, "loss": 0.0001, "num_tokens": 1160922.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 72.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.033946286886930466, "kl": 0.0010177769872825593, "learning_rate": 2.029e-06, "loss": 0.0001, "num_tokens": 1161156.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 72.5, "frac_reward_zero_std": 1.0, "grad_norm": 2.4086551666259766, "kl": 0.17122620344161987, "learning_rate": 2.0286666666666668e-06, "loss": 0.0082, "num_tokens": 1161426.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 72.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10274406522512436, "kl": 0.03523481450974941, "learning_rate": 2.0283333333333335e-06, "loss": 0.0018, "num_tokens": 1161751.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002880946849472821, "kl": 7.771700620651245e-05, "learning_rate": 2.0280000000000003e-06, "loss": 0.0, "num_tokens": 1161971.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.07822265475988388, "kl": 0.0010858774185180664, "learning_rate": 2.0276666666666667e-06, "loss": 0.0001, "num_tokens": 1162183.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 72.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.009367442689836025, "kl": 0.0009228115668520331, "learning_rate": 2.0273333333333335e-06, "loss": 0.0, "num_tokens": 1162455.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 72.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.02175847254693508, "kl": 0.003536662319675088, "learning_rate": 2.027e-06, "loss": 0.0002, "num_tokens": 1162739.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 72.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.756463050842285, "kl": 0.026980872498825192, "learning_rate": 2.0266666666666666e-06, "loss": -0.0226, "num_tokens": 1163069.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 3921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 72.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05325916409492493, "kl": 0.011910397559404373, "learning_rate": 2.0263333333333334e-06, "loss": 0.0006, "num_tokens": 1163396.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04871179163455963, "kl": 0.0012043233145959675, "learning_rate": 2.026e-06, "loss": 0.0001, "num_tokens": 1163617.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.056555528193712234, "kl": 0.0010234564542770386, "learning_rate": 2.0256666666666665e-06, "loss": 0.0001, "num_tokens": 1163829.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 72.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.06023001670837402, "kl": 0.039688965305686, "learning_rate": 2.0253333333333333e-06, "loss": 0.002, "num_tokens": 1164101.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 72.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.008078550919890404, "kl": 0.009403749369084835, "learning_rate": 2.025e-06, "loss": 0.0005, "num_tokens": 1164373.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 72.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.248742580413818, "kl": 0.12861930206418037, "learning_rate": 2.024666666666667e-06, "loss": 0.0706, "num_tokens": 1164717.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 3927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 72.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.05203109234571457, "kl": 0.007720179157331586, "learning_rate": 2.0243333333333337e-06, "loss": 0.0004, "num_tokens": 1164985.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 72.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02307802066206932, "kl": 0.0005348950799088925, "learning_rate": 2.024e-06, "loss": 0.0, "num_tokens": 1165241.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 72.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.18737901747226715, "kl": 0.0535675473511219, "learning_rate": 2.023666666666667e-06, "loss": 0.0027, "num_tokens": 1165531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 72.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0984058827161789, "kl": 0.011192459613084793, "learning_rate": 2.023333333333333e-06, "loss": 0.0006, "num_tokens": 1165836.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 72.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.030288174748420715, "kl": 0.0008908142626751214, "learning_rate": 2.023e-06, "loss": 0.0, "num_tokens": 1166100.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 72.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.13756528496742249, "kl": 0.03603429440408945, "learning_rate": 2.0226666666666667e-06, "loss": 0.0019, "num_tokens": 1166428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 72.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.004450445994734764, "kl": 0.0013517257175408304, "learning_rate": 2.0223333333333335e-06, "loss": 0.0001, "num_tokens": 1166740.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 72.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.014697101898491383, "kl": 0.003958642482757568, "learning_rate": 2.0220000000000003e-06, "loss": 0.0002, "num_tokens": 1167020.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 96.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 43.333335876464844, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 72.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.1491636037826538, "kl": 0.040199367329478264, "learning_rate": 2.0216666666666667e-06, "loss": 0.4241, "num_tokens": 1167622.0, "reward": 1.0, "reward_std": 2.041241407394409, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 2.0412416458129883, "step": 3936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 72.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.036026231944561005, "kl": 0.001046881079673767, "learning_rate": 2.0213333333333334e-06, "loss": 0.0001, "num_tokens": 1167838.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 72.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0066398922353982925, "kl": 0.0015978366136550903, "learning_rate": 2.021e-06, "loss": 0.0001, "num_tokens": 1168150.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 72.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.28849008679389954, "kl": 0.03843085467815399, "learning_rate": 2.0206666666666666e-06, "loss": 0.0026, "num_tokens": 1168460.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 72.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1604706346988678, "kl": 0.024443178437650204, "learning_rate": 2.0203333333333334e-06, "loss": 0.0012, "num_tokens": 1168719.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 72.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.00877202209085226, "kl": 0.26718881726264954, "learning_rate": 2.02e-06, "loss": 0.0134, "num_tokens": 1169023.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.0, "frac_reward_zero_std": 0.0, "grad_norm": 7.530936241149902, "kl": 0.05582649423740804, "learning_rate": 2.019666666666667e-06, "loss": 0.0113, "num_tokens": 1169289.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 73.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.4480419158935547, "kl": 0.04963378421962261, "learning_rate": 2.0193333333333333e-06, "loss": 0.2837, "num_tokens": 1169643.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 3943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 73.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06555754691362381, "kl": 0.004164560232311487, "learning_rate": 2.019e-06, "loss": 0.0002, "num_tokens": 1169913.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 73.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0709647685289383, "kl": 0.0010515674948692322, "learning_rate": 2.018666666666667e-06, "loss": 0.0001, "num_tokens": 1170125.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 73.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11483173817396164, "kl": 0.016486276872456074, "learning_rate": 2.0183333333333336e-06, "loss": 0.0008, "num_tokens": 1170427.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 73.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.010684136301279068, "kl": 0.00032773341808933765, "learning_rate": 2.018e-06, "loss": 0.0, "num_tokens": 1170695.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05566558614373207, "kl": 0.020335861947387457, "learning_rate": 2.0176666666666668e-06, "loss": 0.001, "num_tokens": 1170983.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 73.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.24540309607982635, "kl": 0.018168319948017597, "learning_rate": 2.017333333333333e-06, "loss": 0.0009, "num_tokens": 1171332.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 73.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04036034643650055, "kl": 0.0038348076632246375, "learning_rate": 2.017e-06, "loss": 0.0002, "num_tokens": 1171598.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 73.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.2119781970977783, "kl": 0.0839585941284895, "learning_rate": 2.0166666666666667e-06, "loss": 0.0071, "num_tokens": 1171902.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 73.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.20628385245800018, "kl": 0.030969264917075634, "learning_rate": 2.0163333333333335e-06, "loss": 0.0018, "num_tokens": 1172170.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 73.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.038065794855356216, "kl": 0.00194305187324062, "learning_rate": 2.0160000000000003e-06, "loss": 0.0001, "num_tokens": 1172430.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 73.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.1545465886592865, "kl": 0.03250173863489181, "learning_rate": 2.0156666666666666e-06, "loss": 0.0016, "num_tokens": 1172740.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 73.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.026166506111621857, "kl": 0.04374006390571594, "learning_rate": 2.0153333333333334e-06, "loss": 0.0022, "num_tokens": 1173144.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 73.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02362995222210884, "kl": 0.0029923035763204098, "learning_rate": 2.0149999999999998e-06, "loss": 0.0001, "num_tokens": 1173440.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 73.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.6023633480072021, "kl": 0.0822620838880539, "learning_rate": 2.014666666666667e-06, "loss": 0.0045, "num_tokens": 1173811.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 73.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 8.950230598449707, "kl": 0.11748326430097222, "learning_rate": 2.0143333333333333e-06, "loss": -0.04, "num_tokens": 1174082.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 73.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.6445791721343994, "kl": 0.08978034928441048, "learning_rate": 2.014e-06, "loss": 0.064, "num_tokens": 1174435.0, "reward": 3.5, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 3.674234628677368, "step": 3959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 73.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.06531712412834167, "kl": 0.004923277534544468, "learning_rate": 2.013666666666667e-06, "loss": 0.0002, "num_tokens": 1174747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 73.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.006083305459469557, "kl": 0.00038892030715942383, "learning_rate": 2.0133333333333333e-06, "loss": 0.0, "num_tokens": 1175007.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 73.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.016587592661380768, "kl": 0.013436194974929094, "learning_rate": 2.013e-06, "loss": 0.0007, "num_tokens": 1175267.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.041599780321121216, "kl": 0.007704405812546611, "learning_rate": 2.012666666666667e-06, "loss": 0.0004, "num_tokens": 1175556.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 73.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.49184513092041, "kl": 0.013620304875075817, "learning_rate": 2.0123333333333336e-06, "loss": 0.0268, "num_tokens": 1175880.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 73.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.005873010493814945, "kl": 0.0004627754387911409, "learning_rate": 2.012e-06, "loss": 0.0, "num_tokens": 1176096.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 73.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05824138596653938, "kl": 0.008377837482839823, "learning_rate": 2.0116666666666667e-06, "loss": 0.0004, "num_tokens": 1176385.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 73.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.315967082977295, "kl": 0.13658245280385017, "learning_rate": 2.011333333333333e-06, "loss": 0.0387, "num_tokens": 1176695.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 73.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08227457851171494, "kl": 0.03622059337794781, "learning_rate": 2.011e-06, "loss": 0.0019, "num_tokens": 1177064.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 73.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.026277078315615654, "kl": 0.007054821355268359, "learning_rate": 2.0106666666666667e-06, "loss": 0.0004, "num_tokens": 1177366.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 73.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.179153561592102, "kl": 0.26764756441116333, "learning_rate": 2.0103333333333335e-06, "loss": 0.0132, "num_tokens": 1177670.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 73.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09660425037145615, "kl": 0.026360459625720978, "learning_rate": 2.0100000000000002e-06, "loss": 0.0014, "num_tokens": 1177968.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 73.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 5.040585517883301, "kl": 0.07141377031803131, "learning_rate": 2.0096666666666666e-06, "loss": -0.0769, "num_tokens": 1178254.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.465936660766602, "kl": 0.045494720339775085, "learning_rate": 2.0093333333333334e-06, "loss": 0.2404, "num_tokens": 1178549.0, "reward": 7.125, "reward_std": 0.75, "rewards/reward_combined/mean": 7.125, "rewards/reward_combined/std": 0.75, "step": 3973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 73.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00247369147837162, "kl": 0.0004306994378566742, "learning_rate": 2.0089999999999997e-06, "loss": 0.0, "num_tokens": 1178809.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 73.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.17805567383766174, "kl": 0.04457160085439682, "learning_rate": 2.008666666666667e-06, "loss": 0.0023, "num_tokens": 1179135.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 73.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.19030536711215973, "kl": 0.049640243873000145, "learning_rate": 2.0083333333333333e-06, "loss": 0.0023, "num_tokens": 1179419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 73.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.577756881713867, "kl": 0.06967569701373577, "learning_rate": 2.008e-06, "loss": 0.0198, "num_tokens": 1179748.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 3977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.18938206136226654, "kl": 0.0217663012444973, "learning_rate": 2.007666666666667e-06, "loss": 0.0011, "num_tokens": 1180021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 73.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.046438369899988174, "kl": 0.024857956916093826, "learning_rate": 2.0073333333333332e-06, "loss": 0.0011, "num_tokens": 1180406.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.024009445682168007, "kl": 0.0034509622491896152, "learning_rate": 2.007e-06, "loss": 0.0002, "num_tokens": 1180690.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 73.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.012450838461518288, "kl": 0.002151109278202057, "learning_rate": 2.006666666666667e-06, "loss": 0.0001, "num_tokens": 1181002.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 73.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.045024920254945755, "kl": 0.00181741124833934, "learning_rate": 2.0063333333333336e-06, "loss": 0.0001, "num_tokens": 1181258.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 73.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.01772375963628292, "kl": 0.0005734023579861969, "learning_rate": 2.006e-06, "loss": 0.0, "num_tokens": 1181493.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 73.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.004102497827261686, "kl": 0.00026682019233703613, "learning_rate": 2.0056666666666667e-06, "loss": 0.0, "num_tokens": 1181737.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 73.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03698328137397766, "kl": 0.0018593758286442608, "learning_rate": 2.005333333333333e-06, "loss": 0.0001, "num_tokens": 1181955.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 73.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.16760686039924622, "kl": 0.14208679646253586, "learning_rate": 2.005e-06, "loss": 0.0071, "num_tokens": 1182327.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 73.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.002071453956887126, "kl": 0.0034464672207832336, "learning_rate": 2.004666666666667e-06, "loss": 0.0002, "num_tokens": 1182563.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 73.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.052842941135168076, "kl": 0.0025670191971585155, "learning_rate": 2.0043333333333334e-06, "loss": 0.0001, "num_tokens": 1182894.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 73.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00026878848439082503, "kl": 8.477270603179932e-05, "learning_rate": 2.004e-06, "loss": 0.0, "num_tokens": 1183114.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 73.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.2190403938293457, "kl": 0.11757975816726685, "learning_rate": 2.0036666666666666e-06, "loss": -0.0726, "num_tokens": 1183456.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 73.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 14.3524808883667, "kl": 0.014463717117905617, "learning_rate": 2.0033333333333334e-06, "loss": 0.1639, "num_tokens": 1183675.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 73.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 21.022397994995117, "kl": 0.05136639624834061, "learning_rate": 2.003e-06, "loss": 0.0811, "num_tokens": 1183884.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 3992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 73.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.28949376940727234, "kl": 0.056278922595083714, "learning_rate": 2.002666666666667e-06, "loss": 0.003, "num_tokens": 1184197.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 73.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004327712522353977, "kl": 0.0012162349303252995, "learning_rate": 2.0023333333333333e-06, "loss": 0.0001, "num_tokens": 1184477.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 73.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04026157408952713, "kl": 0.0052518503507599235, "learning_rate": 2.002e-06, "loss": 0.0002, "num_tokens": 1184751.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 74.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.0968830585479736, "kl": 0.08913804963231087, "learning_rate": 2.001666666666667e-06, "loss": -0.0319, "num_tokens": 1185117.0, "reward": 6.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.674234628677368, "step": 3996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 74.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.448393821716309, "kl": 0.24949227273464203, "learning_rate": 2.001333333333333e-06, "loss": 0.0291, "num_tokens": 1185427.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 74.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07311049848794937, "kl": 0.0031819441937841475, "learning_rate": 2.001e-06, "loss": 0.0002, "num_tokens": 1185648.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.19473688304424286, "kl": 0.027633660472929478, "learning_rate": 2.0006666666666668e-06, "loss": 0.0012, "num_tokens": 1185940.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 74.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.12668731808662415, "kl": 0.018358412198722363, "learning_rate": 2.0003333333333336e-06, "loss": 0.0009, "num_tokens": 1186270.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 74.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.002034256234765053, "kl": 0.0034529119729995728, "learning_rate": 2e-06, "loss": 0.0002, "num_tokens": 1186506.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 74.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.552746534347534, "kl": 0.5999440615996718, "learning_rate": 1.9996666666666667e-06, "loss": 0.0618, "num_tokens": 1186767.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 74.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.4583177864551544, "kl": 0.0795559398829937, "learning_rate": 1.999333333333333e-06, "loss": 0.0041, "num_tokens": 1187063.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 74.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002645017229951918, "kl": 8.767843246459961e-05, "learning_rate": 1.9990000000000003e-06, "loss": 0.0, "num_tokens": 1187283.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 74.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 4.329433441162109, "kl": 0.11925218999385834, "learning_rate": 1.998666666666667e-06, "loss": -0.0591, "num_tokens": 1187642.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 4005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 74.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.15682768821716309, "kl": 0.10793199576437473, "learning_rate": 1.9983333333333334e-06, "loss": 0.0054, "num_tokens": 1188012.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 74.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.13586688041687, "kl": 0.18894056230783463, "learning_rate": 1.998e-06, "loss": 0.0798, "num_tokens": 1188406.0, "reward": 6.375, "reward_std": 2.136000871658325, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.136000871658325, "step": 4007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 74.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.008215570822358131, "kl": 0.001679474487900734, "learning_rate": 1.9976666666666665e-06, "loss": 0.0001, "num_tokens": 1188718.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 74.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.16053363680839539, "kl": 0.02550918608903885, "learning_rate": 1.9973333333333333e-06, "loss": 0.0013, "num_tokens": 1189006.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.09577256441116333, "kl": 0.02533774357289076, "learning_rate": 1.997e-06, "loss": 0.0012, "num_tokens": 1189279.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 74.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.022658009082078934, "kl": 0.0008329413831233978, "learning_rate": 1.996666666666667e-06, "loss": 0.0, "num_tokens": 1189539.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01015686709433794, "kl": 0.0028411494567990303, "learning_rate": 1.9963333333333332e-06, "loss": 0.0001, "num_tokens": 1189823.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 74.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0235056821256876, "kl": 0.043126437813043594, "learning_rate": 1.996e-06, "loss": 0.0021, "num_tokens": 1190235.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 74.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04940566420555115, "kl": 0.002448553335852921, "learning_rate": 1.995666666666667e-06, "loss": 0.0001, "num_tokens": 1190531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.17772062122821808, "kl": 0.037724267691373825, "learning_rate": 1.995333333333333e-06, "loss": 0.002, "num_tokens": 1190819.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 74.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 8.137699127197266, "kl": 0.2866111099720001, "learning_rate": 1.995e-06, "loss": 0.0304, "num_tokens": 1191124.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 74.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.029567595571279526, "kl": 0.00426570326089859, "learning_rate": 1.9946666666666667e-06, "loss": 0.0002, "num_tokens": 1191414.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.20623791217803955, "kl": 0.026099749375134706, "learning_rate": 1.9943333333333335e-06, "loss": 0.0013, "num_tokens": 1191714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 74.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06797938048839569, "kl": 0.005014072841731831, "learning_rate": 1.994e-06, "loss": 0.0003, "num_tokens": 1192057.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 74.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.027485482394695282, "kl": 0.0048094624653458595, "learning_rate": 1.9936666666666667e-06, "loss": 0.0002, "num_tokens": 1192377.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 74.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.6406961679458618, "kl": 0.2471523080021143, "learning_rate": 1.993333333333333e-06, "loss": -0.0213, "num_tokens": 1192677.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 74.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07411373406648636, "kl": 0.0016750767827033997, "learning_rate": 1.9930000000000002e-06, "loss": 0.0001, "num_tokens": 1192883.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 74.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.155965328216553, "kl": 0.06356980884447694, "learning_rate": 1.992666666666667e-06, "loss": 0.1141, "num_tokens": 1193213.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 74.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.015834394842386246, "kl": 0.0009228939306922257, "learning_rate": 1.9923333333333334e-06, "loss": 0.0, "num_tokens": 1193481.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 74.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.050818849354982376, "kl": 0.004813584499061108, "learning_rate": 1.992e-06, "loss": 0.0002, "num_tokens": 1193741.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 74.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 8.675247192382812, "kl": 0.023868614342063665, "learning_rate": 1.9916666666666665e-06, "loss": 0.066, "num_tokens": 1194017.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 74.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 4.288616180419922, "kl": 0.06694895215332508, "learning_rate": 1.9913333333333333e-06, "loss": 0.1487, "num_tokens": 1194405.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 4027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 74.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.034000832587480545, "kl": 0.0009955097339116037, "learning_rate": 1.991e-06, "loss": 0.0, "num_tokens": 1194640.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 74.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.11053633689880371, "kl": 0.031633369624614716, "learning_rate": 1.990666666666667e-06, "loss": 0.0016, "num_tokens": 1194993.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 74.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06636285781860352, "kl": 0.004719011951237917, "learning_rate": 1.9903333333333332e-06, "loss": 0.0002, "num_tokens": 1195236.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 74.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.4540215730667114, "kl": 0.06451453268527985, "learning_rate": 1.99e-06, "loss": 0.0227, "num_tokens": 1195570.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 74.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.013661684468388557, "kl": 0.0012496738927438855, "learning_rate": 1.9896666666666668e-06, "loss": 0.0001, "num_tokens": 1195830.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 74.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.016131972894072533, "kl": 0.0032341000624001026, "learning_rate": 1.989333333333333e-06, "loss": 0.0002, "num_tokens": 1196110.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 74.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.028665903955698013, "kl": 0.0018889158964157104, "learning_rate": 1.9890000000000004e-06, "loss": 0.0001, "num_tokens": 1196322.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 74.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.062368400394916534, "kl": 0.00603932049125433, "learning_rate": 1.9886666666666667e-06, "loss": 0.0003, "num_tokens": 1196588.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 74.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.00036966640618629754, "kl": 0.001296249101869762, "learning_rate": 1.9883333333333335e-06, "loss": 0.0001, "num_tokens": 1196865.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 74.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 6.538760662078857, "kl": 0.012847235600929707, "learning_rate": 1.988e-06, "loss": 0.0291, "num_tokens": 1197126.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.997783660888672, "kl": 0.029787374660372734, "learning_rate": 1.9876666666666666e-06, "loss": 0.258, "num_tokens": 1197426.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 4038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 74.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.06280207633972168, "kl": 0.015097802504897118, "learning_rate": 1.9873333333333334e-06, "loss": 0.0008, "num_tokens": 1197723.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 74.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.2232995331287384, "kl": 0.010640449821949005, "learning_rate": 1.987e-06, "loss": 0.0006, "num_tokens": 1197936.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 74.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.016356471925973892, "kl": 0.0002499848706065677, "learning_rate": 1.986666666666667e-06, "loss": 0.0, "num_tokens": 1198192.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 74.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08152364194393158, "kl": 0.015393751673400402, "learning_rate": 1.9863333333333333e-06, "loss": 0.0008, "num_tokens": 1198481.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 74.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.12052378803491592, "kl": 0.006488578743301332, "learning_rate": 1.986e-06, "loss": 0.0003, "num_tokens": 1198749.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 74.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04822550714015961, "kl": 0.012901182286441326, "learning_rate": 1.9856666666666665e-06, "loss": 0.0006, "num_tokens": 1199083.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 74.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.14844229817390442, "kl": 0.0360508244484663, "learning_rate": 1.9853333333333333e-06, "loss": 0.0018, "num_tokens": 1199409.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 74.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04989173635840416, "kl": 0.027683653868734837, "learning_rate": 1.985e-06, "loss": 0.0014, "num_tokens": 1199725.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 74.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05343243107199669, "kl": 0.015758017543703318, "learning_rate": 1.984666666666667e-06, "loss": 0.0008, "num_tokens": 1200138.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 74.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1706589311361313, "kl": 0.019260598346590996, "learning_rate": 1.984333333333333e-06, "loss": 0.001, "num_tokens": 1200406.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 74.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0880153700709343, "kl": 0.004230510909110308, "learning_rate": 1.984e-06, "loss": 0.0002, "num_tokens": 1200718.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.010300721041858196, "kl": 0.004091762006282806, "learning_rate": 1.9836666666666668e-06, "loss": 0.0002, "num_tokens": 1200934.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 75.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07099296897649765, "kl": 0.010204406222328544, "learning_rate": 1.983333333333333e-06, "loss": 0.0005, "num_tokens": 1201229.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0166774932295084, "kl": 0.0005081444978713989, "learning_rate": 1.9830000000000003e-06, "loss": 0.0, "num_tokens": 1201441.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 75.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.041643235832452774, "kl": 0.013038936536759138, "learning_rate": 1.9826666666666667e-06, "loss": 0.0007, "num_tokens": 1201729.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 75.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.08384043723344803, "kl": 0.004945088003296405, "learning_rate": 1.9823333333333335e-06, "loss": 0.0003, "num_tokens": 1201991.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 75.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08956126868724823, "kl": 0.016426128335297108, "learning_rate": 1.982e-06, "loss": 0.0009, "num_tokens": 1202273.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.09401659667491913, "kl": 0.0030057430267333984, "learning_rate": 1.9816666666666666e-06, "loss": 0.0002, "num_tokens": 1202485.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 75.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06951751559972763, "kl": 0.03467301279306412, "learning_rate": 1.9813333333333334e-06, "loss": 0.0012, "num_tokens": 1202837.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 75.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03010636568069458, "kl": 0.04167941212654114, "learning_rate": 1.981e-06, "loss": 0.0021, "num_tokens": 1203128.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 75.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.1628406047821045, "kl": 0.004340556683018804, "learning_rate": 1.980666666666667e-06, "loss": 0.0197, "num_tokens": 1203418.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 4059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 75.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 1.766680121421814, "kl": 0.185578390955925, "learning_rate": 1.9803333333333333e-06, "loss": -0.009, "num_tokens": 1203799.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 75.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.08750255405902863, "kl": 0.008228718303143978, "learning_rate": 1.98e-06, "loss": 0.0004, "num_tokens": 1204108.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 75.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.011226206086575985, "kl": 0.26686032116413116, "learning_rate": 1.9796666666666665e-06, "loss": 0.0133, "num_tokens": 1204412.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017707949737086892, "kl": 0.0035208910703659058, "learning_rate": 1.9793333333333332e-06, "loss": 0.0002, "num_tokens": 1204648.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 75.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.4164939522743225, "kl": 0.049409836530685425, "learning_rate": 1.979e-06, "loss": 0.0025, "num_tokens": 1204947.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 75.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.021025974303483963, "kl": 0.002871686825528741, "learning_rate": 1.978666666666667e-06, "loss": 0.0001, "num_tokens": 1205243.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 75.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.06047816202044487, "kl": 0.0044610954355448484, "learning_rate": 1.9783333333333336e-06, "loss": 0.0002, "num_tokens": 1205517.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 75.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.2698228657245636, "kl": 0.026295214891433716, "learning_rate": 1.978e-06, "loss": 0.0013, "num_tokens": 1205851.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 75.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.3328896462917328, "kl": 0.052076924592256546, "learning_rate": 1.9776666666666667e-06, "loss": 0.0031, "num_tokens": 1206137.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 75.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.028835749253630638, "kl": 0.15647006034851074, "learning_rate": 1.9773333333333335e-06, "loss": 0.0078, "num_tokens": 1206446.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 75.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09617263823747635, "kl": 0.1250823512673378, "learning_rate": 1.9770000000000003e-06, "loss": 0.0063, "num_tokens": 1206818.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 75.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.27157941460609436, "kl": 0.01233864901587367, "learning_rate": 1.9766666666666667e-06, "loss": 0.0006, "num_tokens": 1207096.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 75.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.6838143467903137, "kl": 0.13872230052947998, "learning_rate": 1.9763333333333334e-06, "loss": 0.007, "num_tokens": 1207429.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 75.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.1603475660085678, "kl": 0.05809360183775425, "learning_rate": 1.976e-06, "loss": 0.0029, "num_tokens": 1207752.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 75.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.044732771813869476, "kl": 0.0029091377509757876, "learning_rate": 1.9756666666666666e-06, "loss": 0.0001, "num_tokens": 1208061.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 75.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.10663247853517532, "kl": 0.029886224307119846, "learning_rate": 1.9753333333333334e-06, "loss": 0.0015, "num_tokens": 1208359.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 75.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019416072173044086, "kl": 0.00015534833073616028, "learning_rate": 1.975e-06, "loss": 0.0, "num_tokens": 1208603.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 75.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.041577428579330444, "kl": 0.003687289310619235, "learning_rate": 1.974666666666667e-06, "loss": 0.0002, "num_tokens": 1208867.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 75.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 6.235536098480225, "kl": 0.772625168930972, "learning_rate": 1.9743333333333333e-06, "loss": 0.2187, "num_tokens": 1209150.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 75.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.10694680362939835, "kl": 0.021603311877697706, "learning_rate": 1.974e-06, "loss": 0.0012, "num_tokens": 1209422.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.11569098383188248, "kl": 0.006086892099119723, "learning_rate": 1.9736666666666664e-06, "loss": 0.0002, "num_tokens": 1209676.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 75.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.2142038643360138, "kl": 0.020505985245108604, "learning_rate": 1.9733333333333336e-06, "loss": 0.0014, "num_tokens": 1209906.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 75.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.027767445892095566, "kl": 0.0015846788883209229, "learning_rate": 1.973e-06, "loss": 0.0001, "num_tokens": 1210118.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 75.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1347692906856537, "kl": 0.02943311259150505, "learning_rate": 1.9726666666666668e-06, "loss": 0.0016, "num_tokens": 1210448.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 75.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.636884868144989, "kl": 0.04091236554086208, "learning_rate": 1.9723333333333336e-06, "loss": 0.0023, "num_tokens": 1210715.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 75.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.6120728254318237, "kl": 0.03641462483210489, "learning_rate": 1.972e-06, "loss": 0.0048, "num_tokens": 1210995.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 75.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.010601680725812912, "kl": 0.01476895809173584, "learning_rate": 1.9716666666666667e-06, "loss": 0.0007, "num_tokens": 1211255.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 75.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.028710326179862022, "kl": 0.0011701772746164352, "learning_rate": 1.9713333333333335e-06, "loss": 0.0001, "num_tokens": 1211525.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 75.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.30854856967926025, "kl": 0.022281265817582607, "learning_rate": 1.9710000000000003e-06, "loss": 0.0012, "num_tokens": 1211852.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 75.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08179502189159393, "kl": 0.010333703365176916, "learning_rate": 1.9706666666666666e-06, "loss": 0.0005, "num_tokens": 1212182.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 75.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03348976746201515, "kl": 0.0028405068442225456, "learning_rate": 1.9703333333333334e-06, "loss": 0.0001, "num_tokens": 1212494.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 75.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0213506780564785, "kl": 0.0006847196927992627, "learning_rate": 1.9699999999999998e-06, "loss": 0.0, "num_tokens": 1212729.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 75.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04438813403248787, "kl": 0.0009940594318322837, "learning_rate": 1.9696666666666666e-06, "loss": 0.0, "num_tokens": 1212985.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 75.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.039385054260492325, "kl": 0.027652304619550705, "learning_rate": 1.9693333333333333e-06, "loss": 0.0014, "num_tokens": 1213304.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 75.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.016336074098944664, "kl": 0.0006586983799934387, "learning_rate": 1.969e-06, "loss": 0.0, "num_tokens": 1213564.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002488235186319798, "kl": 9.147077798843384e-05, "learning_rate": 1.968666666666667e-06, "loss": 0.0, "num_tokens": 1213784.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 75.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.4082089960575104, "kl": 0.09059792757034302, "learning_rate": 1.9683333333333333e-06, "loss": 0.0033, "num_tokens": 1214149.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 75.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.916356086730957, "kl": 0.010026805510278791, "learning_rate": 1.968e-06, "loss": 0.1483, "num_tokens": 1214480.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 75.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.6402992010116577, "kl": 0.08111809473484755, "learning_rate": 1.9676666666666664e-06, "loss": -0.0642, "num_tokens": 1214817.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 4098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 75.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.026547765359282494, "kl": 0.008576929569244385, "learning_rate": 1.9673333333333336e-06, "loss": 0.0004, "num_tokens": 1215139.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 75.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 5.0618815422058105, "kl": 0.014985653106123209, "learning_rate": 1.967e-06, "loss": 0.2613, "num_tokens": 1215452.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 75.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.014280945062637329, "kl": 0.002806713921017945, "learning_rate": 1.9666666666666668e-06, "loss": 0.0001, "num_tokens": 1215736.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01078316941857338, "kl": 0.0038907453417778015, "learning_rate": 1.9663333333333335e-06, "loss": 0.0002, "num_tokens": 1215952.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 75.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.024382075294852257, "kl": 0.04322320222854614, "learning_rate": 1.966e-06, "loss": 0.0022, "num_tokens": 1216356.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003846153849735856, "clip_ratio/low_min": 0.003846153849735856, "clip_ratio/region_mean": 0.003846153849735856, "completion_length": 49.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 76.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.079254627227783, "kl": 0.03499941527843475, "learning_rate": 1.9656666666666667e-06, "loss": 0.2508, "num_tokens": 1216776.0, "reward": 5.625, "reward_std": 3.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 3.75, "step": 4104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 76.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.3840368390083313, "kl": 0.057057078927755356, "learning_rate": 1.9653333333333335e-06, "loss": 0.0025, "num_tokens": 1217045.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 76.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.024285269901156425, "kl": 0.0397414518520236, "learning_rate": 1.9650000000000002e-06, "loss": 0.002, "num_tokens": 1217335.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 76.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.010353813879191875, "kl": 0.00384463369846344, "learning_rate": 1.9646666666666666e-06, "loss": 0.0002, "num_tokens": 1217551.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 76.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.4024927616119385, "kl": 0.016710405237972736, "learning_rate": 1.9643333333333334e-06, "loss": 0.0268, "num_tokens": 1217884.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 76.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05337845906615257, "kl": 0.15523239225149155, "learning_rate": 1.9639999999999997e-06, "loss": 0.0078, "num_tokens": 1218194.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 76.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1349731832742691, "kl": 0.008913870726246387, "learning_rate": 1.9636666666666665e-06, "loss": 0.0004, "num_tokens": 1218460.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 76.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.7003679275512695, "kl": 0.014345615170896053, "learning_rate": 1.9633333333333337e-06, "loss": 0.2829, "num_tokens": 1218752.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 76.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.5468757748603821, "kl": 0.05816850659903139, "learning_rate": 1.963e-06, "loss": 0.0039, "num_tokens": 1219023.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 76.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.014043119736015797, "kl": 0.002794499625451863, "learning_rate": 1.962666666666667e-06, "loss": 0.0001, "num_tokens": 1219307.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 76.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 2.786628007888794, "kl": 0.03232190012931824, "learning_rate": 1.9623333333333332e-06, "loss": 0.0844, "num_tokens": 1219629.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 76.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 3.2288224697113037, "kl": 0.015211954480037093, "learning_rate": 1.962e-06, "loss": -0.006, "num_tokens": 1219961.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 76.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.009010828100144863, "kl": 0.04074282944202423, "learning_rate": 1.961666666666667e-06, "loss": 0.002, "num_tokens": 1220366.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 76.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0738772600889206, "kl": 0.011800558771938086, "learning_rate": 1.9613333333333336e-06, "loss": 0.0006, "num_tokens": 1220634.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 76.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.1359788477420807, "kl": 0.026944361627101898, "learning_rate": 1.961e-06, "loss": 0.0014, "num_tokens": 1220956.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 76.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09950874745845795, "kl": 0.002559813321568072, "learning_rate": 1.9606666666666667e-06, "loss": 0.0002, "num_tokens": 1221166.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 76.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.12391962110996246, "kl": 0.02209593402221799, "learning_rate": 1.9603333333333335e-06, "loss": 0.0013, "num_tokens": 1221448.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 76.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.06753460317850113, "kl": 0.0011758595937862992, "learning_rate": 1.96e-06, "loss": 0.0001, "num_tokens": 1221704.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 76.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.08313508331775665, "kl": 0.004922310006804764, "learning_rate": 1.9596666666666667e-06, "loss": 0.0002, "num_tokens": 1221969.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 76.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002995326940435916, "kl": 7.545948028564453e-05, "learning_rate": 1.9593333333333334e-06, "loss": 0.0, "num_tokens": 1222189.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 76.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.14050045609474182, "kl": 0.026244450360536575, "learning_rate": 1.9590000000000002e-06, "loss": 0.0013, "num_tokens": 1222515.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 76.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.001680325367487967, "kl": 0.00012325122952461243, "learning_rate": 1.9586666666666666e-06, "loss": 0.0, "num_tokens": 1222759.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 76.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.5730146169662476, "kl": 0.05556255113333464, "learning_rate": 1.9583333333333334e-06, "loss": 0.003, "num_tokens": 1223096.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 76.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.061254117637872696, "kl": 0.002744505414739251, "learning_rate": 1.9579999999999997e-06, "loss": 0.0001, "num_tokens": 1223392.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 76.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07077538967132568, "kl": 0.0038628350594080985, "learning_rate": 1.957666666666667e-06, "loss": 0.0002, "num_tokens": 1223713.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 76.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732441321015358, "kl": 0.008444469727692194, "learning_rate": 1.9573333333333337e-06, "loss": 0.0005, "num_tokens": 1223986.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 76.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05600501224398613, "kl": 0.0019288398325443268, "learning_rate": 1.957e-06, "loss": 0.0001, "num_tokens": 1224246.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 76.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069036902859807014, "kl": 0.0007605880673509091, "learning_rate": 1.956666666666667e-06, "loss": 0.0, "num_tokens": 1224506.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 76.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009192047640681267, "kl": 0.001749998889863491, "learning_rate": 1.956333333333333e-06, "loss": 0.0001, "num_tokens": 1224818.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 76.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.12667137384414673, "kl": 0.042274574749171734, "learning_rate": 1.956e-06, "loss": 0.002, "num_tokens": 1225119.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 76.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.08845563232898712, "kl": 0.022082612849771976, "learning_rate": 1.9556666666666668e-06, "loss": 0.0011, "num_tokens": 1225461.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 76.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11005394905805588, "kl": 0.008632947457954288, "learning_rate": 1.9553333333333336e-06, "loss": 0.0004, "num_tokens": 1225790.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 76.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08012217283248901, "kl": 0.11442025378346443, "learning_rate": 1.955e-06, "loss": 0.0057, "num_tokens": 1226162.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 76.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.3155250549316406, "kl": 0.3680860660970211, "learning_rate": 1.9546666666666667e-06, "loss": -0.0116, "num_tokens": 1226499.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 76.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.012257328256964684, "kl": 0.26668278872966766, "learning_rate": 1.9543333333333335e-06, "loss": 0.0133, "num_tokens": 1226803.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 76.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027750497683882713, "kl": 6.220936666068155e-05, "learning_rate": 1.954e-06, "loss": 0.0, "num_tokens": 1227023.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 76.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.09977468103170395, "kl": 0.012493321672081947, "learning_rate": 1.9536666666666666e-06, "loss": 0.0006, "num_tokens": 1227312.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 76.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 1.7096132040023804, "kl": 0.25764910224825144, "learning_rate": 1.9533333333333334e-06, "loss": 0.014, "num_tokens": 1227599.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 76.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 3.3478972911834717, "kl": 0.0591567512601614, "learning_rate": 1.953e-06, "loss": 0.1207, "num_tokens": 1227944.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 76.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.13027948141098022, "kl": 0.009938912931829691, "learning_rate": 1.9526666666666665e-06, "loss": 0.0005, "num_tokens": 1228215.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 76.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.021632181480526924, "kl": 0.0007337778806686401, "learning_rate": 1.9523333333333333e-06, "loss": 0.0, "num_tokens": 1228427.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 76.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.11164669692516327, "kl": 0.03634909354150295, "learning_rate": 1.9519999999999997e-06, "loss": 0.0018, "num_tokens": 1228797.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 76.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.002292829332873225, "kl": 0.0034130513668060303, "learning_rate": 1.951666666666667e-06, "loss": 0.0002, "num_tokens": 1229033.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 76.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01807132549583912, "kl": 0.0042166029452346265, "learning_rate": 1.9513333333333337e-06, "loss": 0.0002, "num_tokens": 1229321.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 76.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02342912368476391, "kl": 0.00029052793979644775, "learning_rate": 1.951e-06, "loss": 0.0, "num_tokens": 1229533.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 76.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.09676840901374817, "kl": 0.008191230474039912, "learning_rate": 1.950666666666667e-06, "loss": 0.0004, "num_tokens": 1229839.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 76.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09010336548089981, "kl": 0.005421877605840564, "learning_rate": 1.950333333333333e-06, "loss": 0.0003, "num_tokens": 1230073.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 76.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 7.130167007446289, "kl": 0.30865128501318395, "learning_rate": 1.95e-06, "loss": 0.2629, "num_tokens": 1230364.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 4151 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 76.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.0889623165130615, "kl": 0.06862466409802437, "learning_rate": 1.9496666666666667e-06, "loss": 0.0408, "num_tokens": 1230662.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 76.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 6.209710121154785, "kl": 0.05009952932596207, "learning_rate": 1.9493333333333335e-06, "loss": 0.2324, "num_tokens": 1231013.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 76.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06806277483701706, "kl": 0.06461020186543465, "learning_rate": 1.949e-06, "loss": 0.0032, "num_tokens": 1231399.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 76.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 1.9049837589263916, "kl": 0.0178663469851017, "learning_rate": 1.9486666666666667e-06, "loss": 0.0293, "num_tokens": 1231706.0, "reward": 5.125, "reward_std": 5.421792507171631, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 5.421792507171631, "step": 4155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 76.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.011730004101991653, "kl": 0.014396419283002615, "learning_rate": 1.9483333333333335e-06, "loss": 0.0007, "num_tokens": 1231966.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 76.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06506339460611343, "kl": 0.006239487323909998, "learning_rate": 1.948e-06, "loss": 0.0003, "num_tokens": 1232238.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 77.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.00833575427532196, "kl": 0.0010321637091692537, "learning_rate": 1.947666666666667e-06, "loss": 0.0001, "num_tokens": 1232547.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 77.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.18150325119495392, "kl": 0.03735784627497196, "learning_rate": 1.9473333333333334e-06, "loss": 0.0019, "num_tokens": 1232847.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 77.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.004244080279022455, "kl": 0.0013645078288391232, "learning_rate": 1.947e-06, "loss": 0.0001, "num_tokens": 1233127.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 77.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.2485275268554688, "kl": 0.03940440155565739, "learning_rate": 1.9466666666666665e-06, "loss": -0.0533, "num_tokens": 1233456.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 77.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.10510266572237015, "kl": 0.00682351685827598, "learning_rate": 1.9463333333333333e-06, "loss": 0.0003, "num_tokens": 1233722.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 77.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03601933270692825, "kl": 0.002549659227952361, "learning_rate": 1.946e-06, "loss": 0.0001, "num_tokens": 1234024.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 77.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.331364870071411, "kl": 0.29403063654899597, "learning_rate": 1.945666666666667e-06, "loss": 0.0277, "num_tokens": 1234329.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 77.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.4280591011047363, "kl": 0.7362418845295906, "learning_rate": 1.9453333333333337e-06, "loss": 0.0889, "num_tokens": 1234741.0, "reward": 1.625, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.6007810831069946, "step": 4165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 77.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.2602388858795166, "kl": 0.36333779245615005, "learning_rate": 1.945e-06, "loss": 0.0423, "num_tokens": 1235080.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 77.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.024190451949834824, "kl": 0.0007255449891090393, "learning_rate": 1.944666666666667e-06, "loss": 0.0, "num_tokens": 1235323.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 77.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.039306215941905975, "kl": 0.0032889824360609055, "learning_rate": 1.944333333333333e-06, "loss": 0.0002, "num_tokens": 1235620.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 77.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0383589006960392, "kl": 0.007975178305059671, "learning_rate": 1.944e-06, "loss": 0.0004, "num_tokens": 1235900.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 77.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.775885581970215, "kl": 0.13293201848864555, "learning_rate": 1.9436666666666667e-06, "loss": 0.2955, "num_tokens": 1236254.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 77.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.002294854959473014, "kl": 0.003396354615688324, "learning_rate": 1.9433333333333335e-06, "loss": 0.0002, "num_tokens": 1236490.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 77.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.7070120573043823, "kl": 0.041661586612463, "learning_rate": 1.943e-06, "loss": 0.002, "num_tokens": 1236759.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 77.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01104077510535717, "kl": 0.003519028425216675, "learning_rate": 1.9426666666666666e-06, "loss": 0.0002, "num_tokens": 1236975.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 77.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 6.336381435394287, "kl": 0.02045949501916766, "learning_rate": 1.9423333333333334e-06, "loss": 0.1234, "num_tokens": 1237276.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 77.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05067688226699829, "kl": 0.0012031823571305722, "learning_rate": 1.9419999999999998e-06, "loss": 0.0001, "num_tokens": 1237532.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 77.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.10434551537036896, "kl": 0.02033104095607996, "learning_rate": 1.941666666666667e-06, "loss": 0.001, "num_tokens": 1237862.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 77.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.882805347442627, "kl": 0.0539279232325498, "learning_rate": 1.9413333333333334e-06, "loss": -0.0388, "num_tokens": 1238138.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 77.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.013030019588768482, "kl": 0.00021567940711975098, "learning_rate": 1.941e-06, "loss": 0.0, "num_tokens": 1238350.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 77.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.609494686126709, "kl": 0.023763020522892475, "learning_rate": 1.9406666666666665e-06, "loss": 0.0798, "num_tokens": 1238622.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 77.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 7.1220316886901855, "kl": 0.12488710437901318, "learning_rate": 1.9403333333333333e-06, "loss": -0.0249, "num_tokens": 1238881.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 77.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08558971434831619, "kl": 0.0049365556333214045, "learning_rate": 1.94e-06, "loss": 0.0002, "num_tokens": 1239177.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 77.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.015483858063817024, "kl": 0.003704962902702391, "learning_rate": 1.939666666666667e-06, "loss": 0.0002, "num_tokens": 1239467.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 77.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.019819265231490135, "kl": 0.001352352846879512, "learning_rate": 1.9393333333333336e-06, "loss": 0.0001, "num_tokens": 1239735.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 55.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 77.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.8206136226654053, "kl": 0.03712160140275955, "learning_rate": 1.939e-06, "loss": 0.2776, "num_tokens": 1240179.0, "reward": 6.25, "reward_std": 2.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 2.5, "step": 4184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 77.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04253672435879707, "kl": 0.011192928068339825, "learning_rate": 1.9386666666666668e-06, "loss": 0.0006, "num_tokens": 1240513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 77.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003173008153680712, "kl": 6.728619337081909e-05, "learning_rate": 1.938333333333333e-06, "loss": 0.0, "num_tokens": 1240733.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 77.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02005135826766491, "kl": 0.002891370910219848, "learning_rate": 1.938e-06, "loss": 0.0001, "num_tokens": 1241017.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 77.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.3200020790100098, "kl": 0.08114099875092506, "learning_rate": 1.9376666666666667e-06, "loss": 0.133, "num_tokens": 1241399.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 4188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 77.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 4.443061351776123, "kl": 0.05532825365662575, "learning_rate": 1.9373333333333335e-06, "loss": 0.0109, "num_tokens": 1241705.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 77.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.029117180034518242, "kl": 0.09699800238013268, "learning_rate": 1.9370000000000003e-06, "loss": 0.0048, "num_tokens": 1242077.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 77.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011976920068264008, "kl": 6.387233770510647e-05, "learning_rate": 1.9366666666666666e-06, "loss": 0.0, "num_tokens": 1242297.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 77.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.93695330619812, "kl": 0.11477098986506462, "learning_rate": 1.9363333333333334e-06, "loss": -0.0766, "num_tokens": 1242646.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 4192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 77.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.39902207255363464, "kl": 0.07843837421387434, "learning_rate": 1.936e-06, "loss": 0.0037, "num_tokens": 1242942.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 77.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.2322455644607544, "kl": 0.04774339310824871, "learning_rate": 1.935666666666667e-06, "loss": 0.0025, "num_tokens": 1243302.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 77.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08009503036737442, "kl": 0.026713049970567226, "learning_rate": 1.9353333333333333e-06, "loss": 0.0013, "num_tokens": 1243604.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 77.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.02505462057888508, "kl": 0.0008030809694901109, "learning_rate": 1.935e-06, "loss": 0.0, "num_tokens": 1243866.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 77.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.004699228331446648, "kl": 0.0010910580749623477, "learning_rate": 1.9346666666666665e-06, "loss": 0.0001, "num_tokens": 1244178.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 77.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.041575249284505844, "kl": 0.011271217401372269, "learning_rate": 1.9343333333333333e-06, "loss": 0.0006, "num_tokens": 1244464.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 77.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.008147178217768669, "kl": 0.00954483076930046, "learning_rate": 1.934e-06, "loss": 0.0005, "num_tokens": 1244736.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 77.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.629997730255127, "kl": 0.8537364536896348, "learning_rate": 1.933666666666667e-06, "loss": 0.0701, "num_tokens": 1244997.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 77.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.014347920194268227, "kl": 0.0005748532712459564, "learning_rate": 1.9333333333333336e-06, "loss": 0.0, "num_tokens": 1245257.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 77.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.10747025907039642, "kl": 0.007933998480439186, "learning_rate": 1.933e-06, "loss": 0.0004, "num_tokens": 1245515.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 77.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.013548709452152252, "kl": 0.002063589170575142, "learning_rate": 1.9326666666666667e-06, "loss": 0.0001, "num_tokens": 1245827.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 77.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.16767403483390808, "kl": 0.030538485618308187, "learning_rate": 1.932333333333333e-06, "loss": 0.0014, "num_tokens": 1246147.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 77.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03566483408212662, "kl": 0.0016097147017717361, "learning_rate": 1.9320000000000003e-06, "loss": 0.0001, "num_tokens": 1246416.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 77.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009168892866000533, "kl": 2.0645558834075928e-05, "learning_rate": 1.9316666666666667e-06, "loss": 0.0, "num_tokens": 1246628.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 77.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.14718596637248993, "kl": 0.04609527066349983, "learning_rate": 1.9313333333333334e-06, "loss": 0.0024, "num_tokens": 1246982.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 77.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.308185338973999, "kl": 0.10179235972464085, "learning_rate": 1.9310000000000002e-06, "loss": 0.0051, "num_tokens": 1247360.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 77.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.932782173156738, "kl": 0.09911003150045872, "learning_rate": 1.9306666666666666e-06, "loss": 0.0901, "num_tokens": 1247644.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 77.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.011597269214689732, "kl": 0.00032469630241394043, "learning_rate": 1.9303333333333334e-06, "loss": 0.0, "num_tokens": 1247854.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 77.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.049700118601322174, "kl": 0.0016231692279689014, "learning_rate": 1.93e-06, "loss": 0.0001, "num_tokens": 1248088.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 78.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03323694318532944, "kl": 0.0014779643970541656, "learning_rate": 1.929666666666667e-06, "loss": 0.0001, "num_tokens": 1248413.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 78.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.14461404085159302, "kl": 0.032939229160547256, "learning_rate": 1.9293333333333333e-06, "loss": 0.0017, "num_tokens": 1248709.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 78.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.026254257187247276, "kl": 0.03676637168973684, "learning_rate": 1.929e-06, "loss": 0.0019, "num_tokens": 1249001.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 78.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.11635847389698029, "kl": 0.004655453558370937, "learning_rate": 1.9286666666666664e-06, "loss": 0.0002, "num_tokens": 1249257.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.054019249975681305, "kl": 0.0010344207403250039, "learning_rate": 1.9283333333333332e-06, "loss": 0.0, "num_tokens": 1249470.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 78.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0921623483300209, "kl": 0.01755582168698311, "learning_rate": 1.928e-06, "loss": 0.001, "num_tokens": 1249780.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 78.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.3513851165771484, "kl": 0.0199673967435956, "learning_rate": 1.927666666666667e-06, "loss": 0.0582, "num_tokens": 1250122.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 78.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.13223950564861298, "kl": 0.007777614053338766, "learning_rate": 1.9273333333333336e-06, "loss": 0.0004, "num_tokens": 1250439.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 78.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09461899846792221, "kl": 0.008546177297830582, "learning_rate": 1.927e-06, "loss": 0.0004, "num_tokens": 1250682.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 78.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.6419639587402344, "kl": 0.047966865822672844, "learning_rate": 1.9266666666666667e-06, "loss": -0.1356, "num_tokens": 1251046.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 4221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 78.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07959099113941193, "kl": 0.010591855272650719, "learning_rate": 1.926333333333333e-06, "loss": 0.0005, "num_tokens": 1251314.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 78.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.652174711227417, "kl": 0.10362341441214085, "learning_rate": 1.9260000000000003e-06, "loss": 0.0061, "num_tokens": 1251600.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 78.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.025179386138916, "kl": 0.1205407865345478, "learning_rate": 1.9256666666666666e-06, "loss": 0.0572, "num_tokens": 1251946.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 4224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 78.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.014520753175020218, "kl": 0.0024043945595622063, "learning_rate": 1.9253333333333334e-06, "loss": 0.0001, "num_tokens": 1252242.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 78.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.041846100240945816, "kl": 0.0011122730211354792, "learning_rate": 1.925e-06, "loss": 0.0001, "num_tokens": 1252518.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 78.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.003778262296691537, "kl": 8.817017078399658e-05, "learning_rate": 1.9246666666666666e-06, "loss": 0.0, "num_tokens": 1252730.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 78.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.08723165839910507, "kl": 0.023326152935624123, "learning_rate": 1.9243333333333333e-06, "loss": 0.001, "num_tokens": 1253053.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 78.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.033983197063207626, "kl": 0.009079038631170988, "learning_rate": 1.924e-06, "loss": 0.0004, "num_tokens": 1253342.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 78.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.4294841289520264, "kl": 0.05626204237341881, "learning_rate": 1.923666666666667e-06, "loss": -0.092, "num_tokens": 1253709.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 78.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.3585896492004395, "kl": 0.017691759392619133, "learning_rate": 1.9233333333333333e-06, "loss": 0.0667, "num_tokens": 1253992.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.75, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 78.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.4049912095069885, "kl": 0.06708768382668495, "learning_rate": 1.923e-06, "loss": 0.0034, "num_tokens": 1254375.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 78.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.013180306181311607, "kl": 0.0010220229742117226, "learning_rate": 1.9226666666666664e-06, "loss": 0.0001, "num_tokens": 1254695.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4233 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.007352941203862429, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 78.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.9212491512298584, "kl": 0.10197229124605656, "learning_rate": 1.922333333333333e-06, "loss": 0.0834, "num_tokens": 1255043.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 78.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.006084556225687265, "kl": 0.0003930516541004181, "learning_rate": 1.9220000000000004e-06, "loss": 0.0, "num_tokens": 1255303.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 78.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05273579806089401, "kl": 0.016235220013186336, "learning_rate": 1.9216666666666668e-06, "loss": 0.0009, "num_tokens": 1255619.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 78.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12951022386550903, "kl": 0.025173373520374298, "learning_rate": 1.9213333333333335e-06, "loss": 0.0013, "num_tokens": 1255919.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 78.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.005158380605280399, "kl": 0.2680702954530716, "learning_rate": 1.921e-06, "loss": 0.0134, "num_tokens": 1256223.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 78.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.31500211358070374, "kl": 0.037970778765156865, "learning_rate": 1.9206666666666667e-06, "loss": 0.0018, "num_tokens": 1256505.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 78.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.29990875720977783, "kl": 0.03192344726994634, "learning_rate": 1.9203333333333335e-06, "loss": 0.0016, "num_tokens": 1256776.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 78.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.599093437194824, "kl": 0.03590135369449854, "learning_rate": 1.9200000000000003e-06, "loss": 0.0341, "num_tokens": 1257115.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 78.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.008764144033193588, "kl": 0.009205047506839037, "learning_rate": 1.9196666666666666e-06, "loss": 0.0005, "num_tokens": 1257387.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 78.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03793429583311081, "kl": 0.0028215666534379125, "learning_rate": 1.9193333333333334e-06, "loss": 0.0001, "num_tokens": 1257689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.010651134885847569, "kl": 0.003851078450679779, "learning_rate": 1.919e-06, "loss": 0.0002, "num_tokens": 1257905.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003185986424796283, "kl": 6.905943155288696e-05, "learning_rate": 1.9186666666666665e-06, "loss": 0.0, "num_tokens": 1258125.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 78.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03568355366587639, "kl": 0.0044396971934475005, "learning_rate": 1.9183333333333333e-06, "loss": 0.0002, "num_tokens": 1258395.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 78.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 6.023728370666504, "kl": 0.024265441112220287, "learning_rate": 1.918e-06, "loss": 0.1552, "num_tokens": 1258663.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 78.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03483027592301369, "kl": 0.005986180156469345, "learning_rate": 1.917666666666667e-06, "loss": 0.0003, "num_tokens": 1258954.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 78.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 1.0063624382019043, "kl": 0.05899708718061447, "learning_rate": 1.9173333333333332e-06, "loss": 0.0032, "num_tokens": 1259220.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 78.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 7.068668365478516, "kl": 0.0071154829929582775, "learning_rate": 1.917e-06, "loss": 0.198, "num_tokens": 1259493.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.10169423371553421, "kl": 0.005140399909578264, "learning_rate": 1.9166666666666664e-06, "loss": 0.0003, "num_tokens": 1259710.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 78.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 4.462865352630615, "kl": 0.060255058109760284, "learning_rate": 1.9163333333333336e-06, "loss": -0.0394, "num_tokens": 1260072.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 78.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07884665578603745, "kl": 0.001622125506401062, "learning_rate": 1.9160000000000004e-06, "loss": 0.0001, "num_tokens": 1260276.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 78.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.038811977952718735, "kl": 0.002060721308225766, "learning_rate": 1.9156666666666667e-06, "loss": 0.0001, "num_tokens": 1260546.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 78.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 0.9585207104682922, "kl": 0.14760783314704895, "learning_rate": 1.9153333333333335e-06, "loss": 0.0345, "num_tokens": 1260954.0, "reward": 1.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 1.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 4255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 78.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.040985584259033, "kl": 0.0993734747171402, "learning_rate": 1.915e-06, "loss": 0.0031, "num_tokens": 1261326.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.001683708280324936, "kl": 0.0035288333892822266, "learning_rate": 1.9146666666666667e-06, "loss": 0.0002, "num_tokens": 1261562.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 78.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.24805141985416412, "kl": 0.07200213894248009, "learning_rate": 1.9143333333333334e-06, "loss": 0.0024, "num_tokens": 1261921.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 78.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00754777854308486, "kl": 0.0004747495841002092, "learning_rate": 1.9140000000000002e-06, "loss": 0.0, "num_tokens": 1262156.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 78.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.13387122750282288, "kl": 0.00983450561761856, "learning_rate": 1.9136666666666666e-06, "loss": 0.0005, "num_tokens": 1262414.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 78.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04390999674797058, "kl": 0.004116417956538498, "learning_rate": 1.9133333333333334e-06, "loss": 0.0002, "num_tokens": 1262696.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 78.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.18405263125896454, "kl": 0.04805176518857479, "learning_rate": 1.913e-06, "loss": 0.0024, "num_tokens": 1263022.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 78.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.680548906326294, "kl": 0.1030915305018425, "learning_rate": 1.9126666666666665e-06, "loss": 0.0631, "num_tokens": 1263343.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 4263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 78.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.013858833350241184, "kl": 0.16277816146612167, "learning_rate": 1.9123333333333333e-06, "loss": 0.0081, "num_tokens": 1263651.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 78.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.160861968994141, "kl": 0.12520997156389058, "learning_rate": 1.912e-06, "loss": 0.0064, "num_tokens": 1263927.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 4265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 79.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09493882209062576, "kl": 0.019914139062166214, "learning_rate": 1.911666666666667e-06, "loss": 0.001, "num_tokens": 1264246.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.005010351072996855, "kl": 0.0014900097157806158, "learning_rate": 1.9113333333333332e-06, "loss": 0.0001, "num_tokens": 1264526.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 79.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 10.642378807067871, "kl": 0.14753237552940845, "learning_rate": 1.911e-06, "loss": -0.0592, "num_tokens": 1264804.0, "reward": 6.625, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.0966243743896484, "step": 4268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 79.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.2820652723312378, "kl": 0.32148877531290054, "learning_rate": 1.9106666666666664e-06, "loss": 0.0041, "num_tokens": 1265173.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 79.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.5127744674682617, "kl": 0.08697371184825897, "learning_rate": 1.9103333333333336e-06, "loss": 0.0199, "num_tokens": 1265526.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 4270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 79.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06507395207881927, "kl": 0.007719706802163273, "learning_rate": 1.9100000000000003e-06, "loss": 0.0004, "num_tokens": 1265800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00924857147037983, "kl": 0.0010975684854201972, "learning_rate": 1.9096666666666667e-06, "loss": 0.0001, "num_tokens": 1266119.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 79.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.025987261906266212, "kl": 0.00031992196454666555, "learning_rate": 1.9093333333333335e-06, "loss": 0.0, "num_tokens": 1266332.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.025542592629790306, "kl": 0.006770234787836671, "learning_rate": 1.909e-06, "loss": 0.0003, "num_tokens": 1266621.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 79.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.15291957557201385, "kl": 0.008299468085169792, "learning_rate": 1.9086666666666666e-06, "loss": 0.0004, "num_tokens": 1266866.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 79.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.10867872834205627, "kl": 0.016073176288045943, "learning_rate": 1.9083333333333334e-06, "loss": 0.0008, "num_tokens": 1267195.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 76.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 79.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 1.4292832612991333, "kl": 0.08965224772691727, "learning_rate": 1.908e-06, "loss": 0.0318, "num_tokens": 1267742.0, "reward": 2.299999952316284, "reward_std": 6.58989143371582, "rewards/reward_combined/mean": 2.299999952316284, "rewards/reward_combined/std": 6.58989143371582, "step": 4277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 79.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03266540542244911, "kl": 0.001321264458965743, "learning_rate": 1.9076666666666666e-06, "loss": 0.0001, "num_tokens": 1268014.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 79.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.030367404222488403, "kl": 0.0009278854122385383, "learning_rate": 1.9073333333333333e-06, "loss": 0.0001, "num_tokens": 1268230.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 79.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.661116123199463, "kl": 0.08012433722615242, "learning_rate": 1.9070000000000001e-06, "loss": 0.0896, "num_tokens": 1268572.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 79.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.027522087097168, "kl": 0.030927312094718218, "learning_rate": 1.9066666666666667e-06, "loss": 0.0657, "num_tokens": 1268922.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 4281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 79.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.07460688799619675, "kl": 0.012812409084290266, "learning_rate": 1.9063333333333335e-06, "loss": 0.0006, "num_tokens": 1269251.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 79.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.1361246407032013, "kl": 0.015147800091654062, "learning_rate": 1.906e-06, "loss": 0.0007, "num_tokens": 1269520.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 79.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.38312283158302307, "kl": 0.0375064592808485, "learning_rate": 1.9056666666666668e-06, "loss": 0.0019, "num_tokens": 1269780.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 79.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06417759507894516, "kl": 0.013725708704441786, "learning_rate": 1.9053333333333332e-06, "loss": 0.0007, "num_tokens": 1270074.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 79.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0872919112443924, "kl": 0.037317905575037, "learning_rate": 1.905e-06, "loss": 0.0019, "num_tokens": 1270438.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 79.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06286707520484924, "kl": 0.003374706720933318, "learning_rate": 1.9046666666666665e-06, "loss": 0.0002, "num_tokens": 1270734.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 79.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003144270449411124, "kl": 7.129460573196411e-05, "learning_rate": 1.9043333333333333e-06, "loss": 0.0, "num_tokens": 1270954.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 79.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.031572021543979645, "kl": 0.0025535791646689177, "learning_rate": 1.9040000000000001e-06, "loss": 0.0001, "num_tokens": 1271208.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 79.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.994157791137695, "kl": 0.0037134106969460845, "learning_rate": 1.9036666666666667e-06, "loss": 0.1702, "num_tokens": 1271485.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 79.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.001585761085152626, "kl": 0.0035687386989593506, "learning_rate": 1.9033333333333335e-06, "loss": 0.0002, "num_tokens": 1271721.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 79.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07599655538797379, "kl": 0.0033965239708777517, "learning_rate": 1.903e-06, "loss": 0.0002, "num_tokens": 1271942.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4292 }, { "clip_ratio/high_max": 0.0055555556900799274, "clip_ratio/high_mean": 0.0055555556900799274, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0055555556900799274, "completion_length": 45.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 79.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.595132350921631, "kl": 0.26547348499298096, "learning_rate": 1.9026666666666668e-06, "loss": -0.0063, "num_tokens": 1272351.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 4293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 79.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.028872394934296608, "kl": 0.0007407168741337955, "learning_rate": 1.9023333333333332e-06, "loss": 0.0, "num_tokens": 1272563.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 79.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08227001875638962, "kl": 0.015970894135534763, "learning_rate": 1.9020000000000002e-06, "loss": 0.0008, "num_tokens": 1272865.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 79.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045527382753789425, "kl": 0.26818473637104034, "learning_rate": 1.9016666666666665e-06, "loss": 0.0134, "num_tokens": 1273169.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 79.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.004742157179862261, "kl": 0.01580418087542057, "learning_rate": 1.9013333333333333e-06, "loss": 0.0008, "num_tokens": 1273429.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 79.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04324128106236458, "kl": 0.008228184015024453, "learning_rate": 1.901e-06, "loss": 0.0004, "num_tokens": 1273715.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 79.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.3525957763195038, "kl": 0.03991496190428734, "learning_rate": 1.9006666666666667e-06, "loss": 0.0023, "num_tokens": 1273995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 79.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027048857882618904, "kl": 0.001616847701370716, "learning_rate": 1.9003333333333334e-06, "loss": 0.0001, "num_tokens": 1274307.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 79.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.18871232867240906, "kl": 0.02549402043223381, "learning_rate": 1.9e-06, "loss": 0.0014, "num_tokens": 1274591.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 79.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.10089059174060822, "kl": 0.011142004746943712, "learning_rate": 1.8996666666666668e-06, "loss": 0.0005, "num_tokens": 1274884.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.10412485152482986, "kl": 0.018473886884748936, "learning_rate": 1.8993333333333332e-06, "loss": 0.0009, "num_tokens": 1275172.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 79.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.021944435313344002, "kl": 0.002671961672604084, "learning_rate": 1.8990000000000002e-06, "loss": 0.0001, "num_tokens": 1275456.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4304 }, { "clip_ratio/high_max": 0.017126270104199648, "clip_ratio/high_mean": 0.017126270104199648, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017126270104199648, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 79.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.4490644931793213, "kl": 0.0364563032053411, "learning_rate": 1.8986666666666665e-06, "loss": 0.0091, "num_tokens": 1275826.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 79.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.09895407408475876, "kl": 0.05819042772054672, "learning_rate": 1.8983333333333333e-06, "loss": 0.003, "num_tokens": 1276238.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 79.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.053417712450027466, "kl": 0.0011590928334044293, "learning_rate": 1.898e-06, "loss": 0.0001, "num_tokens": 1276494.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 79.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.106990814208984, "kl": 0.04312201403081417, "learning_rate": 1.8976666666666667e-06, "loss": 0.0704, "num_tokens": 1276838.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 79.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.005180777050554752, "kl": 0.0009722212562337518, "learning_rate": 1.8973333333333334e-06, "loss": 0.0, "num_tokens": 1277150.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 79.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.06083007901906967, "kl": 0.03393148444592953, "learning_rate": 1.897e-06, "loss": 0.0017, "num_tokens": 1277452.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 79.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.23645886778831482, "kl": 0.022639069007709622, "learning_rate": 1.8966666666666668e-06, "loss": 0.0012, "num_tokens": 1277714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 79.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.11366508156061172, "kl": 0.027273572981357574, "learning_rate": 1.8963333333333331e-06, "loss": 0.0014, "num_tokens": 1278020.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 79.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.041096366941928864, "kl": 0.0013561142259277403, "learning_rate": 1.8960000000000001e-06, "loss": 0.0001, "num_tokens": 1278253.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 79.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.009637952782213688, "kl": 0.0044071972370147705, "learning_rate": 1.895666666666667e-06, "loss": 0.0002, "num_tokens": 1278469.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 8.735550880432129, "kl": 0.04328707233071327, "learning_rate": 1.8953333333333333e-06, "loss": 0.0801, "num_tokens": 1278746.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 79.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.17550866305828094, "kl": 0.027146801352500916, "learning_rate": 1.8950000000000003e-06, "loss": 0.0012, "num_tokens": 1279068.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 79.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.08663085103034973, "kl": 0.023260490968823433, "learning_rate": 1.8946666666666666e-06, "loss": 0.0012, "num_tokens": 1279397.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 79.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08788569271564484, "kl": 0.012014943495159969, "learning_rate": 1.8943333333333334e-06, "loss": 0.0006, "num_tokens": 1279669.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 79.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04950766637921333, "kl": 0.0012860782444477081, "learning_rate": 1.894e-06, "loss": 0.0001, "num_tokens": 1279929.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 80.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.2858528196811676, "kl": 0.0411251001060009, "learning_rate": 1.8936666666666668e-06, "loss": 0.0022, "num_tokens": 1280225.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 80.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.9411813020706177, "kl": 0.062322698533535004, "learning_rate": 1.8933333333333333e-06, "loss": 0.0176, "num_tokens": 1280631.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 4321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 80.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003150520788040012, "kl": 7.299333810806274e-05, "learning_rate": 1.8930000000000001e-06, "loss": 0.0, "num_tokens": 1280851.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 80.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.15832574665546417, "kl": 0.015317570883780718, "learning_rate": 1.892666666666667e-06, "loss": 0.0008, "num_tokens": 1281121.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 80.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05653592571616173, "kl": 0.004452500492334366, "learning_rate": 1.8923333333333333e-06, "loss": 0.0002, "num_tokens": 1281417.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 80.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.1150771901011467, "kl": 0.03150841686874628, "learning_rate": 1.8920000000000003e-06, "loss": 0.0015, "num_tokens": 1281756.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 80.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1055835708975792, "kl": 0.007861072663217783, "learning_rate": 1.8916666666666666e-06, "loss": 0.0004, "num_tokens": 1282054.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 80.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.039390482008457184, "kl": 0.0019255817751400173, "learning_rate": 1.8913333333333334e-06, "loss": 0.0001, "num_tokens": 1282324.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 80.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.018356727436184883, "kl": 0.004501785384491086, "learning_rate": 1.891e-06, "loss": 0.0002, "num_tokens": 1282615.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 80.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.006139532197266817, "kl": 0.0001577496514073573, "learning_rate": 1.8906666666666668e-06, "loss": 0.0, "num_tokens": 1282871.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 80.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.015639450401067734, "kl": 0.002255136496387422, "learning_rate": 1.8903333333333333e-06, "loss": 0.0001, "num_tokens": 1283173.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 80.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014675280544906855, "kl": 0.0035926103591918945, "learning_rate": 1.8900000000000001e-06, "loss": 0.0002, "num_tokens": 1283409.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 80.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 9.857666969299316, "kl": 0.0602201409637928, "learning_rate": 1.889666666666667e-06, "loss": 0.3233, "num_tokens": 1283628.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 80.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.1367505043745041, "kl": 0.006942069390788674, "learning_rate": 1.8893333333333333e-06, "loss": 0.0004, "num_tokens": 1283876.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 80.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051569328643381596, "kl": 0.0009948793449439108, "learning_rate": 1.8890000000000003e-06, "loss": 0.0, "num_tokens": 1284188.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 80.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04585277661681175, "kl": 0.0010584443807601929, "learning_rate": 1.8886666666666666e-06, "loss": 0.0, "num_tokens": 1284398.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 80.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.05766990780830383, "kl": 0.01633353903889656, "learning_rate": 1.8883333333333334e-06, "loss": 0.0008, "num_tokens": 1284720.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 80.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03653598576784134, "kl": 0.16410651803016663, "learning_rate": 1.888e-06, "loss": 0.0082, "num_tokens": 1285029.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 80.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.5428266525268555, "kl": 0.3339054733514786, "learning_rate": 1.8876666666666667e-06, "loss": 0.0343, "num_tokens": 1285334.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 80.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009092086926102638, "kl": 0.004559628665447235, "learning_rate": 1.8873333333333333e-06, "loss": 0.0002, "num_tokens": 1285550.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 80.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.17435891926288605, "kl": 0.006879238877445459, "learning_rate": 1.887e-06, "loss": 0.0004, "num_tokens": 1285763.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 80.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.17169271409511566, "kl": 0.05368756130337715, "learning_rate": 1.8866666666666669e-06, "loss": 0.0027, "num_tokens": 1286118.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 80.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.010143171064555645, "kl": 0.0006027743220329285, "learning_rate": 1.8863333333333335e-06, "loss": 0.0, "num_tokens": 1286378.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 80.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.433819532394409, "kl": 0.024248501285910606, "learning_rate": 1.8860000000000002e-06, "loss": 0.0367, "num_tokens": 1286712.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 80.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.003716086270287633, "kl": 0.000155717134475708, "learning_rate": 1.8856666666666666e-06, "loss": 0.0, "num_tokens": 1286956.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 80.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04067979007959366, "kl": 0.00615118513815105, "learning_rate": 1.8853333333333334e-06, "loss": 0.0003, "num_tokens": 1287233.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 80.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004789065103977919, "kl": 0.015812963247299194, "learning_rate": 1.885e-06, "loss": 0.0008, "num_tokens": 1287493.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 80.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.00406251847743988, "kl": 0.0017495816573500633, "learning_rate": 1.8846666666666667e-06, "loss": 0.0001, "num_tokens": 1287805.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 80.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103120744228363, "kl": 0.013902743812650442, "learning_rate": 1.8843333333333333e-06, "loss": 0.0009, "num_tokens": 1288081.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 80.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.2596316337585449, "kl": 0.05641137808561325, "learning_rate": 1.884e-06, "loss": 0.0028, "num_tokens": 1288398.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 80.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.6061272621154785, "kl": 0.06760421209037304, "learning_rate": 1.8836666666666669e-06, "loss": 0.0217, "num_tokens": 1288672.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 80.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.706523895263672, "kl": 0.04157466068863869, "learning_rate": 1.8833333333333334e-06, "loss": 0.1925, "num_tokens": 1289076.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4351 }, { "clip_ratio/high_max": 0.008064515888690948, "clip_ratio/high_mean": 0.008064515888690948, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008064515888690948, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 80.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.284354329109192, "kl": 0.08879881352186203, "learning_rate": 1.8830000000000002e-06, "loss": 0.0829, "num_tokens": 1289430.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 4352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 80.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07128682732582092, "kl": 0.030314982868731022, "learning_rate": 1.8826666666666666e-06, "loss": 0.0015, "num_tokens": 1289759.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 80.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.432647705078125, "kl": 0.2135235331952572, "learning_rate": 1.8823333333333334e-06, "loss": -0.1782, "num_tokens": 1290069.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 80.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.060464859008789, "kl": 0.1365949995815754, "learning_rate": 1.882e-06, "loss": 0.1264, "num_tokens": 1290447.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 4355 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 80.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.9406650066375732, "kl": 0.013447611592710018, "learning_rate": 1.8816666666666667e-06, "loss": -0.0992, "num_tokens": 1290725.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 80.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.10507752746343613, "kl": 0.01101256930269301, "learning_rate": 1.8813333333333333e-06, "loss": 0.0006, "num_tokens": 1290989.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 80.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.010743080638349056, "kl": 0.0011632859241217375, "learning_rate": 1.881e-06, "loss": 0.0001, "num_tokens": 1291309.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 80.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.22154352068901062, "kl": 0.012188445311039686, "learning_rate": 1.8806666666666669e-06, "loss": 0.0006, "num_tokens": 1291581.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 80.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 3.6668715476989746, "kl": 0.024648002348840237, "learning_rate": 1.8803333333333334e-06, "loss": 0.1614, "num_tokens": 1291934.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 4360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 80.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.005185931921005249, "kl": 0.00021985769853927195, "learning_rate": 1.8800000000000002e-06, "loss": 0.0, "num_tokens": 1292154.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 80.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.872429847717285, "kl": 0.03848284587729722, "learning_rate": 1.8796666666666666e-06, "loss": 0.0051, "num_tokens": 1292435.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 4362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 80.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.08073687553405762, "kl": 0.010062988847494125, "learning_rate": 1.8793333333333334e-06, "loss": 0.0005, "num_tokens": 1292693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 80.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.2071800231933594, "kl": 0.07048040628433228, "learning_rate": 1.879e-06, "loss": -0.0647, "num_tokens": 1293067.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 80.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.3753097653388977, "kl": 0.042253438383340836, "learning_rate": 1.8786666666666667e-06, "loss": 0.002, "num_tokens": 1293363.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 80.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.12837550044059753, "kl": 0.016710346564650536, "learning_rate": 1.8783333333333333e-06, "loss": 0.0008, "num_tokens": 1293690.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 80.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005399358458817005, "kl": 0.0003994263242930174, "learning_rate": 1.878e-06, "loss": 0.0, "num_tokens": 1293925.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 80.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.034773413091897964, "kl": 0.0151158763183048, "learning_rate": 1.8776666666666668e-06, "loss": 0.0008, "num_tokens": 1294212.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 80.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06649592518806458, "kl": 0.030844278633594513, "learning_rate": 1.8773333333333334e-06, "loss": 0.0015, "num_tokens": 1294496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 80.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0175692830234766, "kl": 0.0052419493440538645, "learning_rate": 1.8770000000000002e-06, "loss": 0.0003, "num_tokens": 1294764.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 80.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.08722560107707977, "kl": 0.08403240889310837, "learning_rate": 1.8766666666666666e-06, "loss": 0.0043, "num_tokens": 1295132.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 80.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09624875336885452, "kl": 0.013174053281545639, "learning_rate": 1.8763333333333336e-06, "loss": 0.0007, "num_tokens": 1295420.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 80.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.580333232879639, "kl": 0.1536710560321808, "learning_rate": 1.876e-06, "loss": 0.1715, "num_tokens": 1295755.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 4373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01680237427353859, "kl": 0.0023757058661431074, "learning_rate": 1.8756666666666667e-06, "loss": 0.0001, "num_tokens": 1296039.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 81.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03817089647054672, "kl": 0.0018637944594956934, "learning_rate": 1.8753333333333333e-06, "loss": 0.0001, "num_tokens": 1296309.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 81.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02150399424135685, "kl": 0.0009942147298716009, "learning_rate": 1.875e-06, "loss": 0.0, "num_tokens": 1296543.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 81.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 9.481693267822266, "kl": 0.01328006386756897, "learning_rate": 1.8746666666666668e-06, "loss": -0.027, "num_tokens": 1296813.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.564919948577881, "kl": 0.015312770381569862, "learning_rate": 1.8743333333333334e-06, "loss": 0.1962, "num_tokens": 1297126.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 81.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 8.664789199829102, "kl": 0.03134606145613361, "learning_rate": 1.8740000000000002e-06, "loss": 0.2246, "num_tokens": 1297354.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 81.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.006957850884646177, "kl": 0.0016645299619995058, "learning_rate": 1.8736666666666665e-06, "loss": 0.0001, "num_tokens": 1297634.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 81.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.008181389421224594, "kl": 0.0004776865243911743, "learning_rate": 1.8733333333333335e-06, "loss": 0.0, "num_tokens": 1297842.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 81.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10321664065122604, "kl": 0.011219854932278395, "learning_rate": 1.873e-06, "loss": 0.0005, "num_tokens": 1298107.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 81.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.4204522967338562, "kl": 0.10652109235525131, "learning_rate": 1.8726666666666667e-06, "loss": 0.0054, "num_tokens": 1298445.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 81.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.11199964582920074, "kl": 0.02174329198896885, "learning_rate": 1.8723333333333333e-06, "loss": 0.0011, "num_tokens": 1298767.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 81.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0554036907851696, "kl": 0.00952942413277924, "learning_rate": 1.872e-06, "loss": 0.0005, "num_tokens": 1299056.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 81.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.3276974856853485, "kl": 0.04634904861450195, "learning_rate": 1.8716666666666668e-06, "loss": 0.0025, "num_tokens": 1299342.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.006416888441890478, "kl": 0.010424769949167967, "learning_rate": 1.8713333333333334e-06, "loss": 0.0005, "num_tokens": 1299614.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 81.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011799843050539494, "kl": 0.0036661922931671143, "learning_rate": 1.8710000000000002e-06, "loss": 0.0002, "num_tokens": 1299850.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004587155766785145, "clip_ratio/low_min": 0.004587155766785145, "clip_ratio/region_mean": 0.004587155766785145, "completion_length": 46.25, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 81.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.3603315353393555, "kl": 0.1219240315258503, "learning_rate": 1.8706666666666667e-06, "loss": -0.0149, "num_tokens": 1300251.0, "reward": 3.25, "reward_std": 3.3040380477905273, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 3.3040380477905273, "step": 4389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 81.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.05270630866289139, "kl": 0.04471280239522457, "learning_rate": 1.8703333333333335e-06, "loss": 0.0022, "num_tokens": 1300663.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 81.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0057082148268818855, "kl": 0.01567525276914239, "learning_rate": 1.8699999999999999e-06, "loss": 0.0008, "num_tokens": 1300923.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 81.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003464862238615751, "kl": 6.67572021484375e-05, "learning_rate": 1.8696666666666667e-06, "loss": 0.0, "num_tokens": 1301143.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 81.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.232277512550354, "kl": 0.020496641693171114, "learning_rate": 1.8693333333333332e-06, "loss": 0.0013, "num_tokens": 1301424.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09789823740720749, "kl": 0.016767382621765137, "learning_rate": 1.869e-06, "loss": 0.0008, "num_tokens": 1301713.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 81.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.09358594566583633, "kl": 0.0170522122643888, "learning_rate": 1.8686666666666668e-06, "loss": 0.0008, "num_tokens": 1302043.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 81.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.6361145377159119, "kl": 0.02818980673328042, "learning_rate": 1.8683333333333334e-06, "loss": 0.0014, "num_tokens": 1302374.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 81.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.19904351234436035, "kl": 0.023975819582119584, "learning_rate": 1.8680000000000002e-06, "loss": 0.0014, "num_tokens": 1302648.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 81.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.7266082763671875, "kl": 0.043629519641399384, "learning_rate": 1.8676666666666667e-06, "loss": 0.0686, "num_tokens": 1303009.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 81.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03461923822760582, "kl": 0.001159273087978363, "learning_rate": 1.8673333333333335e-06, "loss": 0.0001, "num_tokens": 1303269.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 81.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03138570860028267, "kl": 0.00265646877232939, "learning_rate": 1.8669999999999999e-06, "loss": 0.0001, "num_tokens": 1303580.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 81.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.10708634555339813, "kl": 0.03524964302778244, "learning_rate": 1.8666666666666667e-06, "loss": 0.0018, "num_tokens": 1303908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 81.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.13446207344532013, "kl": 0.03038789052516222, "learning_rate": 1.8663333333333332e-06, "loss": 0.0016, "num_tokens": 1304208.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 81.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.0122764110565186, "kl": 0.15331338718533516, "learning_rate": 1.866e-06, "loss": -0.0109, "num_tokens": 1304578.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 81.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.3191465139389038, "kl": 0.0944865271449089, "learning_rate": 1.8656666666666668e-06, "loss": 0.0047, "num_tokens": 1304965.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 81.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.34566548466682434, "kl": 0.016131113283336163, "learning_rate": 1.8653333333333334e-06, "loss": 0.0011, "num_tokens": 1305181.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 81.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.025926288217306137, "kl": 0.00026322901248931885, "learning_rate": 1.8650000000000001e-06, "loss": 0.0, "num_tokens": 1305394.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 81.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08051861822605133, "kl": 0.008359687402844429, "learning_rate": 1.8646666666666667e-06, "loss": 0.0004, "num_tokens": 1305658.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.021630890667438507, "kl": 0.023254934698343277, "learning_rate": 1.8643333333333335e-06, "loss": 0.0013, "num_tokens": 1305947.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.014926270581781864, "kl": 0.0019489709520712495, "learning_rate": 1.8639999999999999e-06, "loss": 0.0001, "num_tokens": 1306231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 81.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.2556963860988617, "kl": 0.07968771830201149, "learning_rate": 1.8636666666666666e-06, "loss": 0.0034, "num_tokens": 1306554.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 81.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.685092031955719, "kl": 0.10645676963031292, "learning_rate": 1.8633333333333332e-06, "loss": 0.0054, "num_tokens": 1306899.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 81.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03368373587727547, "kl": 0.005524386069737375, "learning_rate": 1.863e-06, "loss": 0.0003, "num_tokens": 1307201.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 81.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.023091167211532593, "kl": 0.0023625462781637907, "learning_rate": 1.8626666666666668e-06, "loss": 0.0001, "num_tokens": 1307461.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 81.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.009918338619172573, "kl": 0.0012138157617300749, "learning_rate": 1.8623333333333333e-06, "loss": 0.0001, "num_tokens": 1307721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 81.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07133351266384125, "kl": 0.16373159736394882, "learning_rate": 1.8620000000000001e-06, "loss": 0.0082, "num_tokens": 1308032.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 81.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.798025608062744, "kl": 0.10175292007625103, "learning_rate": 1.8616666666666667e-06, "loss": 0.1123, "num_tokens": 1308341.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 81.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.04014373570680618, "kl": 0.003596288152039051, "learning_rate": 1.8613333333333335e-06, "loss": 0.0002, "num_tokens": 1308653.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 81.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.1125853061676025, "kl": 0.09719396103173494, "learning_rate": 1.8609999999999998e-06, "loss": -0.0585, "num_tokens": 1309036.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 81.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0448373481631279, "kl": 0.016422273591160774, "learning_rate": 1.8606666666666668e-06, "loss": 0.0008, "num_tokens": 1309341.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 81.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.010965530760586262, "kl": 0.00019196867651771754, "learning_rate": 1.8603333333333332e-06, "loss": 0.0, "num_tokens": 1309597.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 81.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08737992495298386, "kl": 0.03834068216383457, "learning_rate": 1.86e-06, "loss": 0.0019, "num_tokens": 1309872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 81.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.8967413902282715, "kl": 0.011913027847185731, "learning_rate": 1.8596666666666668e-06, "loss": -0.001, "num_tokens": 1310202.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 81.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.008811669424176216, "kl": 0.004763320088386536, "learning_rate": 1.8593333333333333e-06, "loss": 0.0002, "num_tokens": 1310418.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 81.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01139820646494627, "kl": 0.0010455269366502762, "learning_rate": 1.8590000000000001e-06, "loss": 0.0001, "num_tokens": 1310736.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 76.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 81.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.7842226028442383, "kl": 0.010343640809878707, "learning_rate": 1.8586666666666667e-06, "loss": 0.4647, "num_tokens": 1311260.0, "reward": 6.300000190734863, "reward_std": 2.4000000953674316, "rewards/reward_combined/mean": 6.300000190734863, "rewards/reward_combined/std": 2.3999998569488525, "step": 4425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 81.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0674179419875145, "kl": 0.004618597333319485, "learning_rate": 1.8583333333333335e-06, "loss": 0.0002, "num_tokens": 1311503.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 81.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.542253017425537, "kl": 0.3521061260253191, "learning_rate": 1.8579999999999998e-06, "loss": 0.0615, "num_tokens": 1311803.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 82.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021548960357904434, "kl": 0.2686252146959305, "learning_rate": 1.8576666666666668e-06, "loss": 0.0134, "num_tokens": 1312107.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 82.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07663135975599289, "kl": 0.006256932392716408, "learning_rate": 1.8573333333333332e-06, "loss": 0.0003, "num_tokens": 1312379.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 82.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.25091290473938, "kl": 0.42687382688745856, "learning_rate": 1.857e-06, "loss": -0.0498, "num_tokens": 1312718.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 82.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.052096232771873474, "kl": 0.002888257906306535, "learning_rate": 1.8566666666666667e-06, "loss": 0.0001, "num_tokens": 1312988.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 82.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06352750957012177, "kl": 0.003940177790354937, "learning_rate": 1.8563333333333333e-06, "loss": 0.0002, "num_tokens": 1313231.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 82.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.22415760159492493, "kl": 0.023728660540655255, "learning_rate": 1.856e-06, "loss": 0.0014, "num_tokens": 1313558.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 82.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00037158382474444807, "kl": 5.504488945007324e-05, "learning_rate": 1.8556666666666667e-06, "loss": 0.0, "num_tokens": 1313778.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 82.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.011057434603571892, "kl": 0.0010029137483797967, "learning_rate": 1.8553333333333335e-06, "loss": 0.0001, "num_tokens": 1314098.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 82.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.2809027433395386, "kl": 0.020557444542646408, "learning_rate": 1.8549999999999998e-06, "loss": 0.0011, "num_tokens": 1314309.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 82.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.06133417412638664, "kl": 0.26321327686309814, "learning_rate": 1.8546666666666668e-06, "loss": 0.0131, "num_tokens": 1314615.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 82.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014822466764599085, "kl": 0.003606371581554413, "learning_rate": 1.8543333333333332e-06, "loss": 0.0002, "num_tokens": 1314851.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 82.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.05624133348464966, "kl": 0.00447840424021706, "learning_rate": 1.854e-06, "loss": 0.0002, "num_tokens": 1315121.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 82.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.10262904316186905, "kl": 0.1373457908630371, "learning_rate": 1.853666666666667e-06, "loss": 0.0069, "num_tokens": 1315493.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 82.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 7.449041843414307, "kl": 0.022232317132875323, "learning_rate": 1.8533333333333333e-06, "loss": 0.0304, "num_tokens": 1315822.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 82.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07518257200717926, "kl": 0.012855518143624067, "learning_rate": 1.853e-06, "loss": 0.0006, "num_tokens": 1316110.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 82.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06869995594024658, "kl": 0.057380760088562965, "learning_rate": 1.8526666666666667e-06, "loss": 0.0029, "num_tokens": 1316454.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 82.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.12256380915641785, "kl": 0.01273546414449811, "learning_rate": 1.8523333333333334e-06, "loss": 0.0006, "num_tokens": 1316721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 82.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.007562916725873947, "kl": 0.0012484967592172325, "learning_rate": 1.852e-06, "loss": 0.0001, "num_tokens": 1316941.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 82.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.22549545764923096, "kl": 0.06006450392305851, "learning_rate": 1.8516666666666668e-06, "loss": 0.0033, "num_tokens": 1317256.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 82.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.17211684584617615, "kl": 0.027469228953123093, "learning_rate": 1.8513333333333336e-06, "loss": 0.0016, "num_tokens": 1317538.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 82.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.20630426704883575, "kl": 0.051430992782115936, "learning_rate": 1.851e-06, "loss": 0.0025, "num_tokens": 1317811.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 82.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.041266996413469315, "kl": 0.0024755297927185893, "learning_rate": 1.850666666666667e-06, "loss": 0.0001, "num_tokens": 1318071.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 82.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.600343227386475, "kl": 0.05811155028641224, "learning_rate": 1.8503333333333333e-06, "loss": 0.2471, "num_tokens": 1318386.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 4450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 82.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07347581535577774, "kl": 0.01644020201638341, "learning_rate": 1.85e-06, "loss": 0.0009, "num_tokens": 1318657.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 82.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04788690060377121, "kl": 0.009458801476284862, "learning_rate": 1.8496666666666666e-06, "loss": 0.0005, "num_tokens": 1318953.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 82.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.008286603726446629, "kl": 0.0015786755830049515, "learning_rate": 1.8493333333333334e-06, "loss": 0.0001, "num_tokens": 1319265.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 82.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.15357807278633118, "kl": 0.00651755859144032, "learning_rate": 1.849e-06, "loss": 0.0003, "num_tokens": 1319499.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 82.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.13082171976566315, "kl": 0.02668852312490344, "learning_rate": 1.8486666666666668e-06, "loss": 0.0014, "num_tokens": 1319806.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 82.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.19236665964126587, "kl": 0.03527451306581497, "learning_rate": 1.8483333333333336e-06, "loss": 0.0018, "num_tokens": 1320161.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 82.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.0416932106018066, "kl": 0.06623013317584991, "learning_rate": 1.848e-06, "loss": 0.0051, "num_tokens": 1320565.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 4457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 82.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04172458499670029, "kl": 0.0007762670575175434, "learning_rate": 1.847666666666667e-06, "loss": 0.0, "num_tokens": 1320821.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 82.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06234841048717499, "kl": 0.02629727590829134, "learning_rate": 1.8473333333333333e-06, "loss": 0.0013, "num_tokens": 1321175.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 82.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03425826132297516, "kl": 0.164263017475605, "learning_rate": 1.847e-06, "loss": 0.0082, "num_tokens": 1321484.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 82.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.009655088186264038, "kl": 0.004441611468791962, "learning_rate": 1.8466666666666666e-06, "loss": 0.0002, "num_tokens": 1321700.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006172839552164078, "clip_ratio/low_min": 0.006172839552164078, "clip_ratio/region_mean": 0.006172839552164078, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 82.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.3002357482910156, "kl": 0.04747505113482475, "learning_rate": 1.8463333333333334e-06, "loss": -0.0896, "num_tokens": 1322098.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 82.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01681666262447834, "kl": 0.03840895835310221, "learning_rate": 1.846e-06, "loss": 0.002, "num_tokens": 1322390.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 82.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.687446117401123, "kl": 0.03372030612081289, "learning_rate": 1.8456666666666668e-06, "loss": 0.0906, "num_tokens": 1322694.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 82.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05643114820122719, "kl": 0.0071255359798669815, "learning_rate": 1.8453333333333336e-06, "loss": 0.0004, "num_tokens": 1323000.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 82.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.10460171103477478, "kl": 0.0030441894195973873, "learning_rate": 1.8450000000000001e-06, "loss": 0.0002, "num_tokens": 1323213.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 82.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.019232025370001793, "kl": 0.004105325788259506, "learning_rate": 1.844666666666667e-06, "loss": 0.0002, "num_tokens": 1323504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 82.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.013890565373003483, "kl": 0.0019186849240213633, "learning_rate": 1.8443333333333333e-06, "loss": 0.0001, "num_tokens": 1323788.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 82.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02797258459031582, "kl": 0.007735855877399445, "learning_rate": 1.844e-06, "loss": 0.0004, "num_tokens": 1324120.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 82.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.031194305047392845, "kl": 0.005104768555611372, "learning_rate": 1.8436666666666666e-06, "loss": 0.0002, "num_tokens": 1324378.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 82.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.02669372223317623, "kl": 0.0013609528541564941, "learning_rate": 1.8433333333333334e-06, "loss": 0.0001, "num_tokens": 1324590.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 82.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036326637491583824, "kl": 0.0005698055028915405, "learning_rate": 1.843e-06, "loss": 0.0, "num_tokens": 1324850.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 82.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.212266743183136, "kl": 0.07454855367541313, "learning_rate": 1.8426666666666668e-06, "loss": 0.0037, "num_tokens": 1325235.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 82.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.006403028033673763, "kl": 0.0016177594079636037, "learning_rate": 1.8423333333333335e-06, "loss": 0.0001, "num_tokens": 1325515.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 82.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09204453229904175, "kl": 0.03152099810540676, "learning_rate": 1.8420000000000001e-06, "loss": 0.0016, "num_tokens": 1325817.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4475 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 82.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.091361999511719, "kl": 0.0465207826346159, "learning_rate": 1.8416666666666669e-06, "loss": -0.067, "num_tokens": 1326158.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 82.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00873350165784359, "kl": 0.0015225483803078532, "learning_rate": 1.8413333333333333e-06, "loss": 0.0001, "num_tokens": 1326464.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 82.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.20845302939414978, "kl": 0.02591664995998144, "learning_rate": 1.841e-06, "loss": 0.0012, "num_tokens": 1326753.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 82.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.028067268431186676, "kl": 0.004073692311067134, "learning_rate": 1.8406666666666666e-06, "loss": 0.0002, "num_tokens": 1327019.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 82.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.4394779205322266, "kl": 0.06088544428348541, "learning_rate": 1.8403333333333334e-06, "loss": -0.0382, "num_tokens": 1327374.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 4480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 82.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.835725784301758, "kl": 0.18542409967631102, "learning_rate": 1.84e-06, "loss": 0.0704, "num_tokens": 1327662.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 83.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.00518677057698369, "kl": 0.015772895887494087, "learning_rate": 1.8396666666666667e-06, "loss": 0.0008, "num_tokens": 1327922.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06541557610034943, "kl": 0.004301687586121261, "learning_rate": 1.8393333333333335e-06, "loss": 0.0002, "num_tokens": 1328218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 83.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.420662522315979, "kl": 0.055525410920381546, "learning_rate": 1.839e-06, "loss": 0.0638, "num_tokens": 1328593.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 83.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.934332847595215, "kl": 0.1555764600634575, "learning_rate": 1.8386666666666669e-06, "loss": -0.0027, "num_tokens": 1328959.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 4485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 83.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.17491045594215393, "kl": 0.012714287266135216, "learning_rate": 1.8383333333333332e-06, "loss": 0.0007, "num_tokens": 1329226.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 83.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.1922261267900467, "kl": 0.031621651723980904, "learning_rate": 1.838e-06, "loss": 0.0016, "num_tokens": 1329564.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 83.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1284618079662323, "kl": 0.0060302456840872765, "learning_rate": 1.8376666666666666e-06, "loss": 0.0003, "num_tokens": 1329828.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 83.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1297995150089264, "kl": 0.045892647467553616, "learning_rate": 1.8373333333333334e-06, "loss": 0.002, "num_tokens": 1330155.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 83.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.18180488049983978, "kl": 0.028762775473296642, "learning_rate": 1.837e-06, "loss": 0.0015, "num_tokens": 1330448.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4490 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.009615384973585606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 83.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.960066080093384, "kl": 0.04335315991193056, "learning_rate": 1.8366666666666667e-06, "loss": -0.0992, "num_tokens": 1330764.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 4491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.021585283800959587, "kl": 0.0071603478863835335, "learning_rate": 1.8363333333333335e-06, "loss": 0.0004, "num_tokens": 1331032.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.01160829234868288, "kl": 0.0013812032993882895, "learning_rate": 1.836e-06, "loss": 0.0001, "num_tokens": 1331302.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 83.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.022795287892222404, "kl": 0.0024616834707558155, "learning_rate": 1.8356666666666669e-06, "loss": 0.0001, "num_tokens": 1331586.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 83.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.09836888313293457, "kl": 0.008340646279975772, "learning_rate": 1.8353333333333332e-06, "loss": 0.0003, "num_tokens": 1331840.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 83.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.008560781367123127, "kl": 0.009677237831056118, "learning_rate": 1.8350000000000002e-06, "loss": 0.0005, "num_tokens": 1332112.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 83.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0253881998360157, "kl": 0.0003055781126022339, "learning_rate": 1.8346666666666666e-06, "loss": 0.0, "num_tokens": 1332368.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 83.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.34124791622161865, "kl": 0.06942594796419144, "learning_rate": 1.8343333333333334e-06, "loss": 0.0035, "num_tokens": 1332640.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 83.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.00034384283935651183, "kl": 6.54458999633789e-05, "learning_rate": 1.834e-06, "loss": 0.0, "num_tokens": 1332860.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 83.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.043943312019109726, "kl": 0.0016277527902275324, "learning_rate": 1.8336666666666667e-06, "loss": 0.0001, "num_tokens": 1333136.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 83.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02193913236260414, "kl": 0.0007171332836151123, "learning_rate": 1.8333333333333335e-06, "loss": 0.0, "num_tokens": 1333348.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 83.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.12639886140823364, "kl": 0.03929758816957474, "learning_rate": 1.833e-06, "loss": 0.002, "num_tokens": 1333646.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 83.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.30518773198127747, "kl": 0.0242879445431754, "learning_rate": 1.8326666666666669e-06, "loss": 0.0012, "num_tokens": 1333950.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 83.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.23015724122524261, "kl": 0.08431190066039562, "learning_rate": 1.8323333333333332e-06, "loss": 0.0042, "num_tokens": 1334353.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03932933136820793, "kl": 0.006077993428334594, "learning_rate": 1.8320000000000002e-06, "loss": 0.0003, "num_tokens": 1334630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 83.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09884496033191681, "kl": 0.0037670237943530083, "learning_rate": 1.8316666666666666e-06, "loss": 0.0002, "num_tokens": 1334852.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 83.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015210869023576379, "kl": 0.003596954047679901, "learning_rate": 1.8313333333333333e-06, "loss": 0.0002, "num_tokens": 1335088.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 83.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.018884696066379547, "kl": 0.0008276030421257019, "learning_rate": 1.831e-06, "loss": 0.0, "num_tokens": 1335300.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 83.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.681485176086426, "kl": 0.1734844595193863, "learning_rate": 1.8306666666666667e-06, "loss": 0.0868, "num_tokens": 1335615.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 83.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.9553044438362122, "kl": 0.08474664390087128, "learning_rate": 1.8303333333333335e-06, "loss": 0.0042, "num_tokens": 1335831.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 83.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.020320512354373932, "kl": 0.0029306603828445077, "learning_rate": 1.83e-06, "loss": 0.0001, "num_tokens": 1336066.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 83.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.015633560717105865, "kl": 0.002356857992708683, "learning_rate": 1.8296666666666668e-06, "loss": 0.0001, "num_tokens": 1336378.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 83.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 4.939454555511475, "kl": 0.5071391463279724, "learning_rate": 1.8293333333333332e-06, "loss": 0.242, "num_tokens": 1336746.0, "reward": 7.5, "reward_std": 1.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 1.0, "step": 4513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 83.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.015135765075684, "kl": 0.15286946669220924, "learning_rate": 1.8290000000000002e-06, "loss": 0.019, "num_tokens": 1337097.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 83.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02253352664411068, "kl": 0.0007894709706306458, "learning_rate": 1.8286666666666666e-06, "loss": 0.0, "num_tokens": 1337357.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 83.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.037067148834466934, "kl": 0.004079599282704294, "learning_rate": 1.8283333333333333e-06, "loss": 0.0002, "num_tokens": 1337689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 83.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.16268889605998993, "kl": 0.015642430866137147, "learning_rate": 1.828e-06, "loss": 0.0008, "num_tokens": 1338017.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 83.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796192526817322, "kl": 0.022601601667702198, "learning_rate": 1.8276666666666667e-06, "loss": 0.0012, "num_tokens": 1338307.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 83.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.056297432631254196, "kl": 0.008035860490053892, "learning_rate": 1.8273333333333335e-06, "loss": 0.0004, "num_tokens": 1338605.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 83.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0612456314265728, "kl": 0.025490344502031803, "learning_rate": 1.827e-06, "loss": 0.0014, "num_tokens": 1338894.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 83.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08116748183965683, "kl": 0.004820789908990264, "learning_rate": 1.8266666666666668e-06, "loss": 0.0002, "num_tokens": 1339156.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.1013159528374672, "kl": 0.011585609056055546, "learning_rate": 1.8263333333333334e-06, "loss": 0.0006, "num_tokens": 1339428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 83.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.1381785124540329, "kl": 0.02544863522052765, "learning_rate": 1.8260000000000002e-06, "loss": 0.0013, "num_tokens": 1339724.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 83.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05593863129615784, "kl": 0.007262100873049349, "learning_rate": 1.8256666666666665e-06, "loss": 0.0004, "num_tokens": 1340047.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 83.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.06183823570609093, "kl": 0.030166875571012497, "learning_rate": 1.8253333333333333e-06, "loss": 0.0015, "num_tokens": 1340399.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 83.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.005003071390092373, "kl": 0.001004202465992421, "learning_rate": 1.8249999999999999e-06, "loss": 0.0001, "num_tokens": 1340711.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 83.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.00254241144284606, "kl": 0.26853884756565094, "learning_rate": 1.8246666666666667e-06, "loss": 0.0134, "num_tokens": 1341015.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 83.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10519613325595856, "kl": 0.004050101386383176, "learning_rate": 1.8243333333333335e-06, "loss": 0.0003, "num_tokens": 1341226.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 83.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.7129215002059937, "kl": 0.19857073947787285, "learning_rate": 1.824e-06, "loss": 0.0099, "num_tokens": 1341598.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 83.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.001951477024704218, "kl": 0.0001305900514125824, "learning_rate": 1.8236666666666668e-06, "loss": 0.0, "num_tokens": 1341842.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 83.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05018823966383934, "kl": 0.005668552126735449, "learning_rate": 1.8233333333333334e-06, "loss": 0.0002, "num_tokens": 1342121.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 83.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.5049600005149841, "kl": 0.06107555702328682, "learning_rate": 1.8230000000000002e-06, "loss": 0.003, "num_tokens": 1342448.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 83.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.24273280799388885, "kl": 0.06250195764005184, "learning_rate": 1.8226666666666665e-06, "loss": 0.0022, "num_tokens": 1342833.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 83.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07775144279003143, "kl": 0.011150154285132885, "learning_rate": 1.8223333333333333e-06, "loss": 0.0006, "num_tokens": 1343127.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 83.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.028379112482070923, "kl": 0.001139858883107081, "learning_rate": 1.8219999999999999e-06, "loss": 0.0001, "num_tokens": 1343397.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 84.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.006011796649545431, "kl": 0.015576763078570366, "learning_rate": 1.8216666666666667e-06, "loss": 0.0008, "num_tokens": 1343657.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 84.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0366351380944252, "kl": 0.002985842409543693, "learning_rate": 1.8213333333333334e-06, "loss": 0.0001, "num_tokens": 1343939.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 84.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.3243839740753174, "kl": 0.24699855595827103, "learning_rate": 1.821e-06, "loss": -0.0631, "num_tokens": 1344310.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 84.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.09131787717342377, "kl": 0.021963478066027164, "learning_rate": 1.8206666666666668e-06, "loss": 0.0012, "num_tokens": 1344586.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 84.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06585431843996048, "kl": 0.0061761485412716866, "learning_rate": 1.8203333333333334e-06, "loss": 0.0003, "num_tokens": 1344889.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 84.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.23800045251846313, "kl": 0.010036620311439037, "learning_rate": 1.8200000000000002e-06, "loss": 0.0005, "num_tokens": 1345107.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 84.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.5954710841178894, "kl": 0.0889275036752224, "learning_rate": 1.8196666666666665e-06, "loss": 0.0044, "num_tokens": 1345480.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 84.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.026880059391260147, "kl": 0.04410934820771217, "learning_rate": 1.8193333333333335e-06, "loss": 0.0022, "num_tokens": 1345892.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 84.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07424087077379227, "kl": 0.008653692668303847, "learning_rate": 1.8189999999999999e-06, "loss": 0.0004, "num_tokens": 1346166.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 84.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003423725429456681, "kl": 6.1817467212677e-05, "learning_rate": 1.8186666666666666e-06, "loss": 0.0, "num_tokens": 1346386.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 84.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.04150363802909851, "kl": 0.0023117363452911377, "learning_rate": 1.8183333333333334e-06, "loss": 0.0001, "num_tokens": 1346620.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 84.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.09289748966693878, "kl": 0.12808343768119812, "learning_rate": 1.818e-06, "loss": 0.0064, "num_tokens": 1346992.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 84.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.6736209392547607, "kl": 0.04916405491530895, "learning_rate": 1.8176666666666668e-06, "loss": 0.1031, "num_tokens": 1347304.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.009528686292469501, "kl": 0.009169904980808496, "learning_rate": 1.8173333333333334e-06, "loss": 0.0005, "num_tokens": 1347576.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 84.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 5.0630974769592285, "kl": 0.024663350079208612, "learning_rate": 1.8170000000000001e-06, "loss": 0.0756, "num_tokens": 1347912.0, "reward": 5.550000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 5.550000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 4550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 84.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.21899208426475525, "kl": 0.023449061438441277, "learning_rate": 1.8166666666666665e-06, "loss": 0.0012, "num_tokens": 1348239.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 84.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.1031763106584549, "kl": 0.013997962232679129, "learning_rate": 1.8163333333333335e-06, "loss": 0.0007, "num_tokens": 1348527.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 84.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.006296938750892878, "kl": 0.00042418017983436584, "learning_rate": 1.8159999999999999e-06, "loss": 0.0, "num_tokens": 1348787.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 84.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.030918868258595467, "kl": 0.005808588815853, "learning_rate": 1.8156666666666666e-06, "loss": 0.0003, "num_tokens": 1349121.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02945765294134617, "kl": 0.0069765131920576096, "learning_rate": 1.8153333333333334e-06, "loss": 0.0003, "num_tokens": 1349401.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 84.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.002900908002629876, "kl": 0.00014244019985198975, "learning_rate": 1.815e-06, "loss": 0.0, "num_tokens": 1349645.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.17949199676513672, "kl": 0.03702153544872999, "learning_rate": 1.8146666666666668e-06, "loss": 0.0019, "num_tokens": 1349915.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 84.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016543844249099493, "kl": 0.003559909760951996, "learning_rate": 1.8143333333333333e-06, "loss": 0.0002, "num_tokens": 1350151.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 84.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.062095727771520615, "kl": 0.01190496701747179, "learning_rate": 1.8140000000000001e-06, "loss": 0.0006, "num_tokens": 1350405.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09647462517023087, "kl": 0.014620958943851292, "learning_rate": 1.8136666666666665e-06, "loss": 0.0007, "num_tokens": 1350693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 84.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.007919061928987503, "kl": 0.0029959604144096375, "learning_rate": 1.8133333333333335e-06, "loss": 0.0001, "num_tokens": 1350909.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4561 }, { "clip_ratio/high_max": 0.02380952425301075, "clip_ratio/high_mean": 0.02380952425301075, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02380952425301075, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 84.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.815066814422607, "kl": 0.18687807023525238, "learning_rate": 1.8129999999999998e-06, "loss": -0.0142, "num_tokens": 1351211.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 84.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.10813266038894653, "kl": 0.019688505679368973, "learning_rate": 1.8126666666666666e-06, "loss": 0.001, "num_tokens": 1351507.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 84.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10175729542970657, "kl": 0.17746182531118393, "learning_rate": 1.8123333333333336e-06, "loss": 0.0089, "num_tokens": 1351815.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 84.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.17559285461902618, "kl": 0.028179899789392948, "learning_rate": 1.812e-06, "loss": 0.0014, "num_tokens": 1352074.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 84.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033068438060581684, "kl": 0.0017240876331925392, "learning_rate": 1.8116666666666668e-06, "loss": 0.0001, "num_tokens": 1352386.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 84.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.16999752819538116, "kl": 0.03488508611917496, "learning_rate": 1.8113333333333333e-06, "loss": 0.0016, "num_tokens": 1352721.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 84.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.002821570262312889, "kl": 0.2684878706932068, "learning_rate": 1.8110000000000001e-06, "loss": 0.0134, "num_tokens": 1353025.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 84.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.005127115175127983, "kl": 0.0008829649887047708, "learning_rate": 1.8106666666666667e-06, "loss": 0.0, "num_tokens": 1353337.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 84.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1310882568359375, "kl": 0.023206555051729083, "learning_rate": 1.8103333333333335e-06, "loss": 0.0011, "num_tokens": 1353660.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 84.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.460738182067871, "kl": 0.35734116565436125, "learning_rate": 1.8100000000000002e-06, "loss": 0.0924, "num_tokens": 1353970.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 84.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.7891405820846558, "kl": 0.08435942232608795, "learning_rate": 1.8096666666666666e-06, "loss": -0.0144, "num_tokens": 1354331.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 84.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03023175522685051, "kl": 0.0008117755642160773, "learning_rate": 1.8093333333333336e-06, "loss": 0.0001, "num_tokens": 1354547.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 84.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.16713611781597137, "kl": 0.023551173508167267, "learning_rate": 1.809e-06, "loss": 0.0014, "num_tokens": 1354849.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08143755048513412, "kl": 0.02762952505145222, "learning_rate": 1.8086666666666667e-06, "loss": 0.0014, "num_tokens": 1355137.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 84.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.07869987189769745, "kl": 0.002967948792502284, "learning_rate": 1.8083333333333333e-06, "loss": 0.0002, "num_tokens": 1355347.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04444926232099533, "kl": 0.0030876006931066513, "learning_rate": 1.808e-06, "loss": 0.0002, "num_tokens": 1355616.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 84.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0590357631444931, "kl": 0.007288108114153147, "learning_rate": 1.8076666666666667e-06, "loss": 0.0004, "num_tokens": 1355889.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 84.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.016484994441270828, "kl": 0.0035086802672594786, "learning_rate": 1.8073333333333334e-06, "loss": 0.0002, "num_tokens": 1356157.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 84.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.08064191788434982, "kl": 0.0109627153724432, "learning_rate": 1.8070000000000002e-06, "loss": 0.0006, "num_tokens": 1356427.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 84.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03658544272184372, "kl": 0.0006570935256604571, "learning_rate": 1.8066666666666666e-06, "loss": 0.0, "num_tokens": 1356683.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 84.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.18326403200626373, "kl": 0.027295667678117752, "learning_rate": 1.8063333333333336e-06, "loss": 0.0014, "num_tokens": 1356976.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.215245723724365, "kl": 0.04609328508377075, "learning_rate": 1.806e-06, "loss": 0.0343, "num_tokens": 1357288.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 84.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.9655568599700928, "kl": 0.07130121439695358, "learning_rate": 1.8056666666666667e-06, "loss": 0.1517, "num_tokens": 1357646.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 84.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787163764238358, "kl": 0.003359407768584788, "learning_rate": 1.8053333333333333e-06, "loss": 0.0002, "num_tokens": 1357966.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 84.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0718170776963234, "kl": 0.001834855880588293, "learning_rate": 1.805e-06, "loss": 0.0001, "num_tokens": 1358234.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.5, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 84.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 1.5237232446670532, "kl": 0.017707230523228645, "learning_rate": 1.8046666666666667e-06, "loss": 0.0478, "num_tokens": 1358656.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 84.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.016777342185378075, "kl": 0.0011444509727880359, "learning_rate": 1.8043333333333334e-06, "loss": 0.0001, "num_tokens": 1358916.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 84.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.729802370071411, "kl": 0.022776659578084946, "learning_rate": 1.8040000000000002e-06, "loss": 0.1147, "num_tokens": 1359245.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04702303186058998, "kl": 0.0009015277028083801, "learning_rate": 1.8036666666666668e-06, "loss": 0.0, "num_tokens": 1359458.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 85.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.036120835691690445, "kl": 0.007666812743991613, "learning_rate": 1.8033333333333336e-06, "loss": 0.0004, "num_tokens": 1359792.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 85.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09750744700431824, "kl": 0.0036141001619398594, "learning_rate": 1.803e-06, "loss": 0.0002, "num_tokens": 1360050.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 85.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.09926003962755203, "kl": 0.020237690769135952, "learning_rate": 1.8026666666666667e-06, "loss": 0.001, "num_tokens": 1360312.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 85.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.028317492455244064, "kl": 0.0011716534500010312, "learning_rate": 1.8023333333333333e-06, "loss": 0.0001, "num_tokens": 1360532.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 85.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.028795655816793442, "kl": 0.013575200457125902, "learning_rate": 1.802e-06, "loss": 0.0007, "num_tokens": 1360829.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 85.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027327844873070717, "kl": 0.2685043662786484, "learning_rate": 1.8016666666666666e-06, "loss": 0.0134, "num_tokens": 1361133.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.5779788494110107, "kl": 0.7720355167984962, "learning_rate": 1.8013333333333334e-06, "loss": 0.0974, "num_tokens": 1361370.0, "reward": 3.375, "reward_std": 1.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 1.25, "step": 4597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 85.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.14958490431308746, "kl": 0.030108027160167694, "learning_rate": 1.8010000000000002e-06, "loss": 0.0016, "num_tokens": 1361717.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 85.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.06679612398147583, "kl": 0.003531815833412111, "learning_rate": 1.8006666666666668e-06, "loss": 0.0002, "num_tokens": 1361951.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 85.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 1.7573078870773315, "kl": 0.06498152017593384, "learning_rate": 1.8003333333333336e-06, "loss": 0.0022, "num_tokens": 1362356.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 4600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 85.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03526627644896507, "kl": 0.014735556207597256, "learning_rate": 1.8e-06, "loss": 0.0007, "num_tokens": 1362660.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 85.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07709838449954987, "kl": 0.008507295278832316, "learning_rate": 1.7996666666666667e-06, "loss": 0.0004, "num_tokens": 1362958.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.11057507991790771, "kl": 0.014811470173299313, "learning_rate": 1.7993333333333333e-06, "loss": 0.0007, "num_tokens": 1363230.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 85.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.004880153574049473, "kl": 0.001476626261137426, "learning_rate": 1.799e-06, "loss": 0.0001, "num_tokens": 1363510.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.16540856659412384, "kl": 0.02626362256705761, "learning_rate": 1.7986666666666666e-06, "loss": 0.0013, "num_tokens": 1363782.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 85.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 4.5658769607543945, "kl": 0.05040191859006882, "learning_rate": 1.7983333333333334e-06, "loss": -0.0134, "num_tokens": 1364095.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.028984514996409416, "kl": 0.0024870901834219694, "learning_rate": 1.7980000000000002e-06, "loss": 0.0001, "num_tokens": 1364313.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 85.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.6176961660385132, "kl": 0.1045685987919569, "learning_rate": 1.7976666666666668e-06, "loss": -0.1292, "num_tokens": 1364650.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 4608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 85.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0178164504468441, "kl": 0.004536781634669751, "learning_rate": 1.7973333333333335e-06, "loss": 0.0002, "num_tokens": 1364908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 85.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06068643555045128, "kl": 0.03795307315886021, "learning_rate": 1.797e-06, "loss": 0.0019, "num_tokens": 1365212.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 85.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.007539695128798485, "kl": 0.0004823381605092436, "learning_rate": 1.7966666666666667e-06, "loss": 0.0, "num_tokens": 1365484.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 85.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.017650384455919266, "kl": 0.0019455926958471537, "learning_rate": 1.7963333333333333e-06, "loss": 0.0001, "num_tokens": 1365814.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.09376028180122375, "kl": 0.0026987403398379683, "learning_rate": 1.796e-06, "loss": 0.0001, "num_tokens": 1366062.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005154639016836882, "clip_ratio/low_min": 0.005154639016836882, "clip_ratio/region_mean": 0.005154639016836882, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 85.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.3227314949035645, "kl": 0.058972058817744255, "learning_rate": 1.7956666666666666e-06, "loss": -0.1058, "num_tokens": 1366417.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 4614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 85.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04207763075828552, "kl": 0.01446013996610418, "learning_rate": 1.7953333333333334e-06, "loss": 0.0008, "num_tokens": 1366704.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4615 }, { "clip_ratio/high_max": 0.007936508394777775, "clip_ratio/high_mean": 0.007936508394777775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007936508394777775, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 85.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.861692428588867, "kl": 0.11261475086212158, "learning_rate": 1.7950000000000002e-06, "loss": -0.0225, "num_tokens": 1367051.0, "reward": 5.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.34165620803833, "step": 4616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 85.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07999681681394577, "kl": 0.029599539004266262, "learning_rate": 1.7946666666666667e-06, "loss": 0.0015, "num_tokens": 1367407.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.057774126529693604, "kl": 0.0025075782323256135, "learning_rate": 1.7943333333333335e-06, "loss": 0.0001, "num_tokens": 1367703.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 85.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.12313896417617798, "kl": 0.006098371231928468, "learning_rate": 1.7939999999999999e-06, "loss": 0.0003, "num_tokens": 1368027.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02504068613052368, "kl": 0.002125752973370254, "learning_rate": 1.7936666666666669e-06, "loss": 0.0001, "num_tokens": 1368309.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 85.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.039043039083480835, "kl": 0.0010836496949195862, "learning_rate": 1.7933333333333332e-06, "loss": 0.0001, "num_tokens": 1368519.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 85.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05343858152627945, "kl": 0.0021771948086097836, "learning_rate": 1.793e-06, "loss": 0.0001, "num_tokens": 1368762.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 85.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.005909664090722799, "kl": 0.001699230633676052, "learning_rate": 1.7926666666666666e-06, "loss": 0.0001, "num_tokens": 1369074.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.018692174926400185, "kl": 0.005533390445634723, "learning_rate": 1.7923333333333334e-06, "loss": 0.0003, "num_tokens": 1369342.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 85.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02741626650094986, "kl": 0.0006975308060646057, "learning_rate": 1.7920000000000002e-06, "loss": 0.0, "num_tokens": 1369616.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 85.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 4.779372215270996, "kl": 0.02081705629825592, "learning_rate": 1.7916666666666667e-06, "loss": 0.0643, "num_tokens": 1369912.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 85.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.16113191843032837, "kl": 0.02319626696407795, "learning_rate": 1.7913333333333335e-06, "loss": 0.0011, "num_tokens": 1370235.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.011597000062465668, "kl": 0.00022526085376739502, "learning_rate": 1.7909999999999999e-06, "loss": 0.0, "num_tokens": 1370447.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 85.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.001522162463515997, "kl": 0.0004556626081466675, "learning_rate": 1.7906666666666669e-06, "loss": 0.0, "num_tokens": 1370707.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 85.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015761494869366288, "kl": 5.093216896057129e-05, "learning_rate": 1.7903333333333332e-06, "loss": 0.0, "num_tokens": 1370963.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 7.365943431854248, "kl": 0.07962773321196437, "learning_rate": 1.79e-06, "loss": 0.0019, "num_tokens": 1371235.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 85.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06607271730899811, "kl": 0.008443673374131322, "learning_rate": 1.7896666666666666e-06, "loss": 0.0004, "num_tokens": 1371528.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 85.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01242861244827509, "kl": 0.0035997406812384725, "learning_rate": 1.7893333333333334e-06, "loss": 0.0002, "num_tokens": 1371794.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 85.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.08627626299858093, "kl": 0.00473215157398954, "learning_rate": 1.7890000000000002e-06, "loss": 0.0002, "num_tokens": 1372113.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 85.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.07728192210197449, "kl": 0.16567887365818024, "learning_rate": 1.7886666666666667e-06, "loss": 0.0083, "num_tokens": 1372423.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 85.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009083964861929417, "kl": 0.00946786068379879, "learning_rate": 1.7883333333333335e-06, "loss": 0.0005, "num_tokens": 1372695.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 85.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.20461241900920868, "kl": 0.07090083882212639, "learning_rate": 1.7879999999999999e-06, "loss": 0.0036, "num_tokens": 1373029.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.7736951112747192, "kl": 0.04620979353785515, "learning_rate": 1.7876666666666669e-06, "loss": 0.0029, "num_tokens": 1373248.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 85.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09738536924123764, "kl": 0.13988986611366272, "learning_rate": 1.7873333333333332e-06, "loss": 0.007, "num_tokens": 1373620.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 85.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.2058975249528885, "kl": 0.01700449548661709, "learning_rate": 1.787e-06, "loss": 0.0009, "num_tokens": 1373950.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003152804565615952, "kl": 7.610023021697998e-05, "learning_rate": 1.7866666666666666e-06, "loss": 0.0, "num_tokens": 1374170.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 85.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.10333876311779022, "kl": 0.014639072585850954, "learning_rate": 1.7863333333333334e-06, "loss": 0.0007, "num_tokens": 1374458.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 85.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.045813076198101044, "kl": 0.003015394788235426, "learning_rate": 1.7860000000000001e-06, "loss": 0.0002, "num_tokens": 1374726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 86.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.121963731944561, "kl": 0.036465009674429893, "learning_rate": 1.7856666666666667e-06, "loss": 0.0018, "num_tokens": 1375066.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 86.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.015926910564303398, "kl": 0.0005370676517486572, "learning_rate": 1.7853333333333335e-06, "loss": 0.0, "num_tokens": 1375322.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 86.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03132804110646248, "kl": 0.0014229993685148656, "learning_rate": 1.785e-06, "loss": 0.0001, "num_tokens": 1375643.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 86.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05192486569285393, "kl": 0.0046487832441926, "learning_rate": 1.7846666666666668e-06, "loss": 0.0002, "num_tokens": 1375941.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 86.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.039292074739933014, "kl": 0.0027387288864701986, "learning_rate": 1.7843333333333332e-06, "loss": 0.0001, "num_tokens": 1376213.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 86.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05678660422563553, "kl": 0.03147179167717695, "learning_rate": 1.784e-06, "loss": 0.0016, "num_tokens": 1376567.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 86.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0512794628739357, "kl": 0.003787681460380554, "learning_rate": 1.7836666666666666e-06, "loss": 0.0002, "num_tokens": 1376856.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 86.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.036692336201667786, "kl": 0.003396979649551213, "learning_rate": 1.7833333333333333e-06, "loss": 0.0002, "num_tokens": 1377186.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 86.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01630033180117607, "kl": 0.0018120001768693328, "learning_rate": 1.7830000000000001e-06, "loss": 0.0001, "num_tokens": 1377454.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 86.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.08206231892108917, "kl": 0.04010830633342266, "learning_rate": 1.7826666666666667e-06, "loss": 0.002, "num_tokens": 1377756.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 86.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.11931747198104858, "kl": 0.0059440258191898465, "learning_rate": 1.7823333333333335e-06, "loss": 0.0004, "num_tokens": 1378023.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 86.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.3945906460285187, "kl": 0.03288387740030885, "learning_rate": 1.782e-06, "loss": 0.0017, "num_tokens": 1378318.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 86.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.3630080223083496, "kl": 0.061734676361083984, "learning_rate": 1.7816666666666668e-06, "loss": -0.0927, "num_tokens": 1378691.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 86.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 5.374413013458252, "kl": 0.07402043789625168, "learning_rate": 1.7813333333333332e-06, "loss": 0.0802, "num_tokens": 1378976.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 86.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.005743706598877907, "kl": 0.0003885440528392792, "learning_rate": 1.781e-06, "loss": 0.0, "num_tokens": 1379236.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 86.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.250213384628296, "kl": 0.08220694027841091, "learning_rate": 1.7806666666666665e-06, "loss": -0.019, "num_tokens": 1379536.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 86.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.036384690552949905, "kl": 0.0015540399472229183, "learning_rate": 1.7803333333333333e-06, "loss": 0.0001, "num_tokens": 1379779.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 86.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.386549949645996, "kl": 0.0676152752712369, "learning_rate": 1.7800000000000001e-06, "loss": 0.0169, "num_tokens": 1380088.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 86.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003157753963023424, "kl": 7.578730583190918e-05, "learning_rate": 1.7796666666666667e-06, "loss": 0.0, "num_tokens": 1380308.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 86.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.18242207169532776, "kl": 0.021608256734907627, "learning_rate": 1.7793333333333335e-06, "loss": 0.0011, "num_tokens": 1380639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 86.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.346588134765625, "kl": 0.09413415193557739, "learning_rate": 1.779e-06, "loss": 0.0493, "num_tokens": 1380993.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 86.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02952669747173786, "kl": 0.0024364853743463755, "learning_rate": 1.7786666666666668e-06, "loss": 0.0001, "num_tokens": 1381228.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 86.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09102169424295425, "kl": 0.011722934752469882, "learning_rate": 1.7783333333333332e-06, "loss": 0.0006, "num_tokens": 1381500.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 86.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06333321332931519, "kl": 0.003842551843263209, "learning_rate": 1.7780000000000002e-06, "loss": 0.0002, "num_tokens": 1381811.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 86.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.24839980900287628, "kl": 0.04897315055131912, "learning_rate": 1.7776666666666665e-06, "loss": 0.0025, "num_tokens": 1382087.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 86.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.18892042338848114, "kl": 0.025432278867810965, "learning_rate": 1.7773333333333333e-06, "loss": 0.0013, "num_tokens": 1382377.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 86.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06789866089820862, "kl": 0.019855202175676823, "learning_rate": 1.777e-06, "loss": 0.001, "num_tokens": 1382664.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 86.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03407621011137962, "kl": 0.05034089274704456, "learning_rate": 1.7766666666666667e-06, "loss": 0.0025, "num_tokens": 1383004.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 86.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09539736062288284, "kl": 0.0142113221809268, "learning_rate": 1.7763333333333335e-06, "loss": 0.0008, "num_tokens": 1383386.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 86.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01676495186984539, "kl": 0.002237536944448948, "learning_rate": 1.776e-06, "loss": 0.0001, "num_tokens": 1383698.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 86.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.005544695071876049, "kl": 0.01563091389834881, "learning_rate": 1.7756666666666668e-06, "loss": 0.0008, "num_tokens": 1383958.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 86.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.17210577428340912, "kl": 0.01150283170863986, "learning_rate": 1.7753333333333332e-06, "loss": 0.0006, "num_tokens": 1384285.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 86.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.010064753703773022, "kl": 0.0001001238779281266, "learning_rate": 1.7750000000000002e-06, "loss": 0.0, "num_tokens": 1384541.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 86.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.10454939305782318, "kl": 0.0301426500082016, "learning_rate": 1.7746666666666665e-06, "loss": 0.0015, "num_tokens": 1384869.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 86.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.01666002906858921, "kl": 0.0014522984565701336, "learning_rate": 1.7743333333333333e-06, "loss": 0.0001, "num_tokens": 1385088.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 86.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.044514428824186325, "kl": 0.0018117026193067431, "learning_rate": 1.774e-06, "loss": 0.0001, "num_tokens": 1385372.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 86.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.018677186220884323, "kl": 0.005287077045068145, "learning_rate": 1.7736666666666667e-06, "loss": 0.0003, "num_tokens": 1385640.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 86.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.051670122891664505, "kl": 0.014468851499259472, "learning_rate": 1.7733333333333334e-06, "loss": 0.0007, "num_tokens": 1386003.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 86.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0088161900639534, "kl": 0.009596351999789476, "learning_rate": 1.773e-06, "loss": 0.0005, "num_tokens": 1386275.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 86.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.008560660295188427, "kl": 0.00012165521184215322, "learning_rate": 1.7726666666666668e-06, "loss": 0.0, "num_tokens": 1386555.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 86.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.004640623927116394, "kl": 0.0014469127054326236, "learning_rate": 1.7723333333333331e-06, "loss": 0.0001, "num_tokens": 1386771.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 86.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.06340131163597107, "kl": 0.009800476138480008, "learning_rate": 1.7720000000000001e-06, "loss": 0.0005, "num_tokens": 1387069.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 86.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.437071323394775, "kl": 0.00455561134731397, "learning_rate": 1.7716666666666665e-06, "loss": 0.0831, "num_tokens": 1387358.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 86.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03959748148918152, "kl": 0.07488290406763554, "learning_rate": 1.7713333333333333e-06, "loss": 0.0038, "num_tokens": 1387729.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 86.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.10948716849088669, "kl": 0.024760215543210506, "learning_rate": 1.771e-06, "loss": 0.0013, "num_tokens": 1388026.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 86.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.1482488363981247, "kl": 0.020993283949792385, "learning_rate": 1.7706666666666666e-06, "loss": 0.001, "num_tokens": 1388303.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 86.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009246274130418897, "kl": 0.0037707313895225525, "learning_rate": 1.7703333333333334e-06, "loss": 0.0002, "num_tokens": 1388539.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 86.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03788057342171669, "kl": 0.041045090183615685, "learning_rate": 1.77e-06, "loss": 0.002, "num_tokens": 1388952.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 86.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.007056123577058315, "kl": 0.0044207972241565585, "learning_rate": 1.7696666666666668e-06, "loss": 0.0002, "num_tokens": 1389210.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 86.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.009064299054443836, "kl": 0.0008456170617137104, "learning_rate": 1.7693333333333333e-06, "loss": 0.0, "num_tokens": 1389470.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 86.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.023429162800312042, "kl": 0.00045727938413619995, "learning_rate": 1.7690000000000001e-06, "loss": 0.0, "num_tokens": 1389680.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 86.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.21100997924804688, "kl": 0.05531185492873192, "learning_rate": 1.768666666666667e-06, "loss": 0.0028, "num_tokens": 1390021.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 86.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.025573842227458954, "kl": 0.0002676844596862793, "learning_rate": 1.7683333333333333e-06, "loss": 0.0, "num_tokens": 1390234.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 86.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04619613289833069, "kl": 0.011511936318129301, "learning_rate": 1.7680000000000003e-06, "loss": 0.0005, "num_tokens": 1390566.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 87.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.00197571306489408, "kl": 0.26865261793136597, "learning_rate": 1.7676666666666666e-06, "loss": 0.0134, "num_tokens": 1390870.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 87.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.18719759583473206, "kl": 0.041125768795609474, "learning_rate": 1.7673333333333334e-06, "loss": 0.0021, "num_tokens": 1391196.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 87.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07855581492185593, "kl": 0.01782261545304209, "learning_rate": 1.767e-06, "loss": 0.0009, "num_tokens": 1391527.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 87.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06112566962838173, "kl": 0.0025870645185932517, "learning_rate": 1.7666666666666668e-06, "loss": 0.0001, "num_tokens": 1391825.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 87.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.14201121032238007, "kl": 0.055685851722955704, "learning_rate": 1.7663333333333333e-06, "loss": 0.0027, "num_tokens": 1392242.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.024855894967913628, "kl": 0.010527586564421654, "learning_rate": 1.7660000000000001e-06, "loss": 0.0005, "num_tokens": 1392514.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 87.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 12.536376953125, "kl": 0.004537135362625122, "learning_rate": 1.765666666666667e-06, "loss": -0.0005, "num_tokens": 1392774.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 87.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020892838947474957, "kl": 0.2686363011598587, "learning_rate": 1.7653333333333333e-06, "loss": 0.0134, "num_tokens": 1393078.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 87.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.15940794348716736, "kl": 0.03647000528872013, "learning_rate": 1.7650000000000003e-06, "loss": 0.0019, "num_tokens": 1393420.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 87.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.24599826335906982, "kl": 0.014139835315290838, "learning_rate": 1.7646666666666666e-06, "loss": 0.0007, "num_tokens": 1393663.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 87.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.009678980335593224, "kl": 0.0008582860173191875, "learning_rate": 1.7643333333333334e-06, "loss": 0.0, "num_tokens": 1393923.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 87.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0481424406170845, "kl": 0.0011678288865368813, "learning_rate": 1.764e-06, "loss": 0.0, "num_tokens": 1394143.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 87.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0892491266131401, "kl": 0.017186392098665237, "learning_rate": 1.7636666666666667e-06, "loss": 0.0009, "num_tokens": 1394465.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 87.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.09247300773859024, "kl": 0.02190759778022766, "learning_rate": 1.7633333333333333e-06, "loss": 0.0012, "num_tokens": 1394743.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 87.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0053375172428786755, "kl": 0.0009459902939852327, "learning_rate": 1.763e-06, "loss": 0.0, "num_tokens": 1395055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 87.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.13784445822238922, "kl": 0.025329535827040672, "learning_rate": 1.7626666666666669e-06, "loss": 0.0013, "num_tokens": 1395358.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 87.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.031100338324904442, "kl": 0.004801976203452796, "learning_rate": 1.7623333333333335e-06, "loss": 0.0002, "num_tokens": 1395616.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 87.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.11222835630178452, "kl": 0.004167563864029944, "learning_rate": 1.7620000000000002e-06, "loss": 0.0002, "num_tokens": 1395849.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 87.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009320286917500198, "kl": 0.003768324851989746, "learning_rate": 1.7616666666666666e-06, "loss": 0.0002, "num_tokens": 1396085.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 87.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.992314100265503, "kl": 0.07456966117024422, "learning_rate": 1.7613333333333334e-06, "loss": 0.0238, "num_tokens": 1396448.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 87.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.014076169580221176, "kl": 0.0014486652798950672, "learning_rate": 1.761e-06, "loss": 0.0001, "num_tokens": 1396776.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 87.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01541076973080635, "kl": 0.0023488476872444153, "learning_rate": 1.7606666666666667e-06, "loss": 0.0001, "num_tokens": 1397088.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07820127159357071, "kl": 0.0077334127854555845, "learning_rate": 1.7603333333333333e-06, "loss": 0.0004, "num_tokens": 1397388.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 87.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03230593726038933, "kl": 0.001247149455593899, "learning_rate": 1.76e-06, "loss": 0.0001, "num_tokens": 1397645.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 87.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07705104351043701, "kl": 0.0156064429320395, "learning_rate": 1.7596666666666669e-06, "loss": 0.0008, "num_tokens": 1397923.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 87.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.8879602551460266, "kl": 0.1230792049318552, "learning_rate": 1.7593333333333334e-06, "loss": 0.0061, "num_tokens": 1398221.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.75, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 73.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 87.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.8353389501571655, "kl": 0.09082326479256153, "learning_rate": 1.7590000000000002e-06, "loss": 0.332, "num_tokens": 1398736.0, "reward": 5.550000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 5.550000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 4724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 87.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.14337654411792755, "kl": 0.02798518445342779, "learning_rate": 1.7586666666666666e-06, "loss": 0.0014, "num_tokens": 1399118.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 87.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03855842724442482, "kl": 0.0004110857844352722, "learning_rate": 1.7583333333333334e-06, "loss": 0.0, "num_tokens": 1399330.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 87.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.006476222071796656, "kl": 0.0005181431770324707, "learning_rate": 1.758e-06, "loss": 0.0, "num_tokens": 1399590.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 87.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.060850590467453, "kl": 0.0025919199688360095, "learning_rate": 1.7576666666666667e-06, "loss": 0.0001, "num_tokens": 1399918.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.34920141100883484, "kl": 0.03598490194417536, "learning_rate": 1.7573333333333333e-06, "loss": 0.002, "num_tokens": 1400213.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 87.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.005956654902547598, "kl": 0.015529973432421684, "learning_rate": 1.757e-06, "loss": 0.0008, "num_tokens": 1400473.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 87.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025009100791066885, "kl": 0.003056600457057357, "learning_rate": 1.7566666666666669e-06, "loss": 0.0001, "num_tokens": 1400741.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 87.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.42736586928367615, "kl": 0.08293714374303818, "learning_rate": 1.7563333333333334e-06, "loss": 0.0037, "num_tokens": 1401071.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 87.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0065531861037015915, "kl": 0.0003086984215769917, "learning_rate": 1.7560000000000002e-06, "loss": 0.0, "num_tokens": 1401291.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 87.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.049356941133737564, "kl": 0.16276530921459198, "learning_rate": 1.7556666666666666e-06, "loss": 0.0081, "num_tokens": 1401601.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 87.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.47890207171440125, "kl": 0.018510058522224426, "learning_rate": 1.7553333333333334e-06, "loss": 0.0009, "num_tokens": 1401817.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.07846085727214813, "kl": 0.02969266881700605, "learning_rate": 1.755e-06, "loss": 0.0015, "num_tokens": 1402105.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 87.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01720520481467247, "kl": 0.06209162622690201, "learning_rate": 1.7546666666666667e-06, "loss": 0.0031, "num_tokens": 1402437.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 87.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.033414989709854126, "kl": 0.0066407660488039255, "learning_rate": 1.7543333333333333e-06, "loss": 0.0003, "num_tokens": 1402769.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 87.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.027305880561470985, "kl": 0.0009377330643474124, "learning_rate": 1.754e-06, "loss": 0.0, "num_tokens": 1403039.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 6.2385735511779785, "kl": 0.030476846266537905, "learning_rate": 1.7536666666666668e-06, "loss": 0.209, "num_tokens": 1403341.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 87.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 4.8092451095581055, "kl": 0.2841898649930954, "learning_rate": 1.7533333333333334e-06, "loss": -0.119, "num_tokens": 1403697.0, "reward": 6.625, "reward_std": 2.428133726119995, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.428133726119995, "step": 4741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 87.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 5.387889862060547, "kl": 0.1311328737065196, "learning_rate": 1.7530000000000002e-06, "loss": 0.0091, "num_tokens": 1403971.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 87.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.023371785879135132, "kl": 0.005181870190426707, "learning_rate": 1.7526666666666666e-06, "loss": 0.0003, "num_tokens": 1404259.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 87.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0339960977435112, "kl": 0.005621553864330053, "learning_rate": 1.7523333333333336e-06, "loss": 0.0003, "num_tokens": 1404550.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 87.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002939185651484877, "kl": 8.12336802482605e-05, "learning_rate": 1.752e-06, "loss": 0.0, "num_tokens": 1404770.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 87.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010580105008557439, "kl": 4.505366086959839e-05, "learning_rate": 1.7516666666666667e-06, "loss": 0.0, "num_tokens": 1404982.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 87.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.2652187943458557, "kl": 0.0241744231316261, "learning_rate": 1.7513333333333333e-06, "loss": 0.0011, "num_tokens": 1405244.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 87.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.6330318450927734, "kl": 0.8132757358253002, "learning_rate": 1.751e-06, "loss": 0.0205, "num_tokens": 1405612.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.1833159178495407, "kl": 0.03436026722192764, "learning_rate": 1.7506666666666668e-06, "loss": 0.0017, "num_tokens": 1405884.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 87.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12061919271945953, "kl": 0.0028928687097504735, "learning_rate": 1.7503333333333334e-06, "loss": 0.0001, "num_tokens": 1406156.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 87.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03356965258717537, "kl": 0.0042511168867349625, "learning_rate": 1.7500000000000002e-06, "loss": 0.0002, "num_tokens": 1406433.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 88.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.9887139797210693, "kl": 0.019847046583890915, "learning_rate": 1.7496666666666665e-06, "loss": -0.0394, "num_tokens": 1406739.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4752 }, { "clip_ratio/high_max": 0.01515151560306549, "clip_ratio/high_mean": 0.01515151560306549, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01515151560306549, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 88.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 9.949356079101562, "kl": 0.024055887013673782, "learning_rate": 1.7493333333333335e-06, "loss": 0.1458, "num_tokens": 1407020.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 88.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.031621236354112625, "kl": 0.0035676882253028452, "learning_rate": 1.749e-06, "loss": 0.0002, "num_tokens": 1407315.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 88.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.012054701335728168, "kl": 0.0038182729622349143, "learning_rate": 1.7486666666666667e-06, "loss": 0.0002, "num_tokens": 1407605.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 88.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.02661200799047947, "kl": 0.0019001525943167508, "learning_rate": 1.7483333333333333e-06, "loss": 0.0001, "num_tokens": 1407931.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 88.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.040140535682439804, "kl": 0.002985157072544098, "learning_rate": 1.748e-06, "loss": 0.0001, "num_tokens": 1408213.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 88.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.893159866333008, "kl": 0.17455535382032394, "learning_rate": 1.7476666666666668e-06, "loss": 0.0607, "num_tokens": 1408565.0, "reward": 4.25, "reward_std": 4.27200174331665, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 4.27200174331665, "step": 4758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 88.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.9771177768707275, "kl": 0.10692603280767798, "learning_rate": 1.7473333333333334e-06, "loss": -0.0245, "num_tokens": 1408849.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 88.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 2.2160401344299316, "kl": 0.20149564469465986, "learning_rate": 1.7470000000000002e-06, "loss": 0.0125, "num_tokens": 1409124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 88.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.10250312089920044, "kl": 0.00881947239395231, "learning_rate": 1.7466666666666665e-06, "loss": 0.0004, "num_tokens": 1409384.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 88.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.09445659071207047, "kl": 0.04594658687710762, "learning_rate": 1.7463333333333335e-06, "loss": 0.0023, "num_tokens": 1409684.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 88.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.015238243155181408, "kl": 0.0001875132293207571, "learning_rate": 1.7459999999999999e-06, "loss": 0.0, "num_tokens": 1409940.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 88.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07152358442544937, "kl": 0.024334699905011803, "learning_rate": 1.7456666666666667e-06, "loss": 0.0012, "num_tokens": 1410228.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 88.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.09340780973434448, "kl": 0.03230349626392126, "learning_rate": 1.7453333333333332e-06, "loss": 0.0013, "num_tokens": 1410613.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 88.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.12714384496212006, "kl": 0.11031162738800049, "learning_rate": 1.745e-06, "loss": 0.0055, "num_tokens": 1410985.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 88.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07792849838733673, "kl": 0.011701170820742846, "learning_rate": 1.7446666666666668e-06, "loss": 0.0006, "num_tokens": 1411315.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 88.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.06055062264204025, "kl": 0.0009274661424569786, "learning_rate": 1.7443333333333334e-06, "loss": 0.0, "num_tokens": 1411585.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 88.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.668433904647827, "kl": 0.05290278419852257, "learning_rate": 1.7440000000000002e-06, "loss": -0.0847, "num_tokens": 1411940.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 88.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009742482216097414, "kl": 0.003757782280445099, "learning_rate": 1.7436666666666667e-06, "loss": 0.0002, "num_tokens": 1412176.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 88.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.007873695343732834, "kl": 0.0016839823802001774, "learning_rate": 1.7433333333333335e-06, "loss": 0.0001, "num_tokens": 1412456.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 88.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.004643014166504145, "kl": 0.00017097840463975444, "learning_rate": 1.7429999999999999e-06, "loss": 0.0, "num_tokens": 1412728.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 88.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.1629791259765625, "kl": 0.22995993122458458, "learning_rate": 1.7426666666666667e-06, "loss": -0.0238, "num_tokens": 1413128.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 4773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 88.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0317719392478466, "kl": 0.001265214232262224, "learning_rate": 1.7423333333333332e-06, "loss": 0.0001, "num_tokens": 1413400.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 88.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.008855005726218224, "kl": 0.009439130313694477, "learning_rate": 1.742e-06, "loss": 0.0005, "num_tokens": 1413672.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 88.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03459019213914871, "kl": 0.0020393177692312747, "learning_rate": 1.7416666666666668e-06, "loss": 0.0001, "num_tokens": 1413891.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4776 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.014705882407724857, "clip_ratio/low_mean": 0.014705882407724857, "clip_ratio/low_min": 0.014705882407724857, "clip_ratio/region_mean": 0.029411764815449715, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 88.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.4239726066589355, "kl": 0.06648285128176212, "learning_rate": 1.7413333333333334e-06, "loss": -0.1482, "num_tokens": 1414189.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 88.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06055663526058197, "kl": 0.07100075110793114, "learning_rate": 1.7410000000000001e-06, "loss": 0.0035, "num_tokens": 1414574.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 88.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.00029829610139131546, "kl": 7.943809032440186e-05, "learning_rate": 1.7406666666666667e-06, "loss": 0.0, "num_tokens": 1414794.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 88.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.006371496245265007, "kl": 0.00045894537470303476, "learning_rate": 1.7403333333333335e-06, "loss": 0.0, "num_tokens": 1415115.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 88.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.11365672200918198, "kl": 0.0212111659348011, "learning_rate": 1.7399999999999999e-06, "loss": 0.0011, "num_tokens": 1415412.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 88.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.005644958931952715, "kl": 0.000581253319978714, "learning_rate": 1.7396666666666666e-06, "loss": 0.0, "num_tokens": 1415672.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 88.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04187702015042305, "kl": 0.004522688686847687, "learning_rate": 1.7393333333333332e-06, "loss": 0.0002, "num_tokens": 1416002.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 88.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009981738403439522, "kl": 0.00011271610856056213, "learning_rate": 1.739e-06, "loss": 0.0, "num_tokens": 1416246.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 88.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.015374717302620411, "kl": 0.0005145706236362457, "learning_rate": 1.7386666666666668e-06, "loss": 0.0, "num_tokens": 1416508.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 88.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.16972176730632782, "kl": 0.018287737853825092, "learning_rate": 1.7383333333333333e-06, "loss": 0.0012, "num_tokens": 1416731.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 88.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02892535738646984, "kl": 0.000577746395720169, "learning_rate": 1.7380000000000001e-06, "loss": 0.0, "num_tokens": 1416964.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 88.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.023547876626253128, "kl": 0.0006417706608772278, "learning_rate": 1.7376666666666667e-06, "loss": 0.0, "num_tokens": 1417174.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 88.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07347511500120163, "kl": 0.008679481688886881, "learning_rate": 1.7373333333333335e-06, "loss": 0.0004, "num_tokens": 1417462.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 88.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0788622796535492, "kl": 0.004397721262648702, "learning_rate": 1.7369999999999998e-06, "loss": 0.0002, "num_tokens": 1417758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 88.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08594287186861038, "kl": 0.014041016809642315, "learning_rate": 1.7366666666666668e-06, "loss": 0.0007, "num_tokens": 1418087.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 88.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.09737042337656021, "kl": 0.015641923062503338, "learning_rate": 1.7363333333333332e-06, "loss": 0.0008, "num_tokens": 1418395.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 88.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.17543596029281616, "kl": 0.036995720118284225, "learning_rate": 1.736e-06, "loss": 0.0019, "num_tokens": 1418732.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 88.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032571146730333567, "kl": 0.2684212028980255, "learning_rate": 1.7356666666666668e-06, "loss": 0.0134, "num_tokens": 1419036.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 88.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03074190579354763, "kl": 0.000304393470287323, "learning_rate": 1.7353333333333333e-06, "loss": 0.0, "num_tokens": 1419248.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 88.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 4.21857213973999, "kl": 0.030106719117611647, "learning_rate": 1.7350000000000001e-06, "loss": -0.0121, "num_tokens": 1419565.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 88.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.06326611340045929, "kl": 0.009333632420748472, "learning_rate": 1.7346666666666667e-06, "loss": 0.0005, "num_tokens": 1419839.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 88.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.14643755555152893, "kl": 0.052632153034210205, "learning_rate": 1.7343333333333335e-06, "loss": 0.0026, "num_tokens": 1420111.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 88.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.514625549316406, "kl": 0.03136721812188625, "learning_rate": 1.7339999999999998e-06, "loss": 0.1059, "num_tokens": 1420469.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 4799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 88.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.026346635073423386, "kl": 0.0018991194665431976, "learning_rate": 1.7336666666666668e-06, "loss": 0.0001, "num_tokens": 1420781.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 88.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01011976134032011, "kl": 0.16344992071390152, "learning_rate": 1.7333333333333332e-06, "loss": 0.0082, "num_tokens": 1421089.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 88.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04138335585594177, "kl": 0.003717821091413498, "learning_rate": 1.733e-06, "loss": 0.0002, "num_tokens": 1421411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 88.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09094447642564774, "kl": 0.021962124854326248, "learning_rate": 1.7326666666666667e-06, "loss": 0.0012, "num_tokens": 1421688.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 88.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.13578258454799652, "kl": 0.019133458845317364, "learning_rate": 1.7323333333333333e-06, "loss": 0.001, "num_tokens": 1421951.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 88.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04140186682343483, "kl": 0.02233363315463066, "learning_rate": 1.732e-06, "loss": 0.0011, "num_tokens": 1422327.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 89.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.07996660470962524, "kl": 0.001408204436302185, "learning_rate": 1.7316666666666667e-06, "loss": 0.0001, "num_tokens": 1422547.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 89.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 7.286879062652588, "kl": 0.015015662327641621, "learning_rate": 1.7313333333333335e-06, "loss": 0.0889, "num_tokens": 1422824.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 4807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 89.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022143719252198935, "kl": 0.0004230029881000519, "learning_rate": 1.7309999999999998e-06, "loss": 0.0, "num_tokens": 1423084.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 89.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.12269677966833115, "kl": 0.03959126025438309, "learning_rate": 1.7306666666666668e-06, "loss": 0.002, "num_tokens": 1423467.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 89.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.016634922474622726, "kl": 0.023318459207075648, "learning_rate": 1.7303333333333332e-06, "loss": 0.0012, "num_tokens": 1423756.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 89.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.02401517890393734, "kl": 0.0019014648860320449, "learning_rate": 1.73e-06, "loss": 0.0001, "num_tokens": 1424036.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 89.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06616130471229553, "kl": 0.015449159778654575, "learning_rate": 1.7296666666666667e-06, "loss": 0.0008, "num_tokens": 1424340.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 89.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06612984836101532, "kl": 0.03784245811402798, "learning_rate": 1.7293333333333333e-06, "loss": 0.0019, "num_tokens": 1424642.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 89.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.013312330469489098, "kl": 0.018647802527993917, "learning_rate": 1.729e-06, "loss": 0.0009, "num_tokens": 1424920.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 89.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.3609471321105957, "kl": 0.26293565332889557, "learning_rate": 1.7286666666666667e-06, "loss": 0.055, "num_tokens": 1425270.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 4815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 89.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033325408585369587, "kl": 0.26840245723724365, "learning_rate": 1.7283333333333334e-06, "loss": 0.0134, "num_tokens": 1425574.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 89.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011171288788318634, "kl": 0.00010593980550765991, "learning_rate": 1.728e-06, "loss": 0.0, "num_tokens": 1425818.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 89.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.012760492041707039, "kl": 0.05843185447156429, "learning_rate": 1.7276666666666668e-06, "loss": 0.0029, "num_tokens": 1426150.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 89.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.010549293830990791, "kl": 0.014556619804352522, "learning_rate": 1.7273333333333336e-06, "loss": 0.0007, "num_tokens": 1426410.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 89.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.006148915272206068, "kl": 0.00037801267171744257, "learning_rate": 1.727e-06, "loss": 0.0, "num_tokens": 1426670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 89.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.050276126712560654, "kl": 0.0018282572855241597, "learning_rate": 1.726666666666667e-06, "loss": 0.0001, "num_tokens": 1426937.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 89.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0307657178491354, "kl": 0.00029872357845306396, "learning_rate": 1.7263333333333333e-06, "loss": 0.0, "num_tokens": 1427149.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 89.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.00695132277905941, "kl": 0.00042761834629345685, "learning_rate": 1.726e-06, "loss": 0.0, "num_tokens": 1427467.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 89.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.029064415022730827, "kl": 0.003741058288142085, "learning_rate": 1.7256666666666666e-06, "loss": 0.0002, "num_tokens": 1427796.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 89.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.9398007392883301, "kl": 0.09074857085943222, "learning_rate": 1.7253333333333334e-06, "loss": 0.0045, "num_tokens": 1428016.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 89.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0314173623919487, "kl": 0.002311806194484234, "learning_rate": 1.725e-06, "loss": 0.0001, "num_tokens": 1428300.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 89.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03163344785571098, "kl": 0.0005777254700660706, "learning_rate": 1.7246666666666668e-06, "loss": 0.0, "num_tokens": 1428506.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 89.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005189788527786732, "kl": 0.0004135353665333241, "learning_rate": 1.7243333333333336e-06, "loss": 0.0, "num_tokens": 1428818.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 89.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.7829396724700928, "kl": 0.062060149386525154, "learning_rate": 1.724e-06, "loss": 0.005, "num_tokens": 1429106.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 89.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.7605483531951904, "kl": 0.107602808624506, "learning_rate": 1.723666666666667e-06, "loss": 0.0065, "num_tokens": 1429477.0, "reward": 5.625, "reward_std": 2.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 2.75, "step": 4830 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.01315789483487606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01315789483487606, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 89.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 9.245172500610352, "kl": 0.05810554325580597, "learning_rate": 1.7233333333333333e-06, "loss": 0.137, "num_tokens": 1429762.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 89.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01007368229329586, "kl": 0.1634574607014656, "learning_rate": 1.723e-06, "loss": 0.0082, "num_tokens": 1430070.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 89.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05980310216546059, "kl": 0.0070490543730556965, "learning_rate": 1.7226666666666666e-06, "loss": 0.0004, "num_tokens": 1430364.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 89.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.006359180435538292, "kl": 0.00012331008838373236, "learning_rate": 1.7223333333333334e-06, "loss": 0.0, "num_tokens": 1430620.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 89.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.026532836258411407, "kl": 0.0018375739455223083, "learning_rate": 1.722e-06, "loss": 0.0001, "num_tokens": 1430932.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 89.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.027995651587843895, "kl": 0.004782599047757685, "learning_rate": 1.7216666666666668e-06, "loss": 0.0002, "num_tokens": 1431227.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 89.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.051071785390377045, "kl": 0.008735061157494783, "learning_rate": 1.7213333333333336e-06, "loss": 0.0004, "num_tokens": 1431500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 89.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.852427959442139, "kl": 0.016891013365238905, "learning_rate": 1.721e-06, "loss": 0.1198, "num_tokens": 1431783.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 89.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.028023596853017807, "kl": 0.007031596032902598, "learning_rate": 1.720666666666667e-06, "loss": 0.0004, "num_tokens": 1432076.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 89.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.31240132451057434, "kl": 0.01999014150351286, "learning_rate": 1.7203333333333333e-06, "loss": 0.001, "num_tokens": 1432374.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 89.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09493028372526169, "kl": 0.06407080590724945, "learning_rate": 1.72e-06, "loss": 0.0033, "num_tokens": 1432793.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 89.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.007937569171190262, "kl": 0.0008932113996706903, "learning_rate": 1.7196666666666666e-06, "loss": 0.0, "num_tokens": 1433013.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 89.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010140584781765938, "kl": 0.00374448299407959, "learning_rate": 1.7193333333333334e-06, "loss": 0.0002, "num_tokens": 1433249.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 89.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 6.036374092102051, "kl": 0.08073539473116398, "learning_rate": 1.719e-06, "loss": 0.1016, "num_tokens": 1433508.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 89.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02493441291153431, "kl": 0.0010606646537780762, "learning_rate": 1.7186666666666668e-06, "loss": 0.0001, "num_tokens": 1433720.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 89.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.17476995289325714, "kl": 0.018154183868318796, "learning_rate": 1.7183333333333335e-06, "loss": 0.0009, "num_tokens": 1434010.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 89.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.09208376705646515, "kl": 0.006146706640720367, "learning_rate": 1.7180000000000001e-06, "loss": 0.0003, "num_tokens": 1434226.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 89.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.10774128139019012, "kl": 0.019184167496860027, "learning_rate": 1.7176666666666669e-06, "loss": 0.001, "num_tokens": 1434559.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 89.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01987144537270069, "kl": 0.0017949468601727858, "learning_rate": 1.7173333333333333e-06, "loss": 0.0001, "num_tokens": 1434829.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011904762126505375, "clip_ratio/low_min": 0.011904762126505375, "clip_ratio/region_mean": 0.011904762126505375, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 89.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 6.479868412017822, "kl": 0.2453064126893878, "learning_rate": 1.717e-06, "loss": 0.2103, "num_tokens": 1435117.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 4850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 89.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.10015010088682175, "kl": 0.003540456644259393, "learning_rate": 1.7166666666666666e-06, "loss": 0.0002, "num_tokens": 1435351.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 89.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.6972345113754272, "kl": 0.04174995399080217, "learning_rate": 1.7163333333333334e-06, "loss": 0.0021, "num_tokens": 1435680.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 89.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07457930594682693, "kl": 0.01834350824356079, "learning_rate": 1.716e-06, "loss": 0.0009, "num_tokens": 1436002.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 89.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.17040890455245972, "kl": 0.020668487064540386, "learning_rate": 1.7156666666666667e-06, "loss": 0.0011, "num_tokens": 1436346.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 89.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.0210583209991455, "kl": 0.10992814600467682, "learning_rate": 1.7153333333333335e-06, "loss": 0.0474, "num_tokens": 1436679.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 89.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.18218640983104706, "kl": 0.011712106177583337, "learning_rate": 1.715e-06, "loss": 0.0006, "num_tokens": 1436983.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 89.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 7.696809768676758, "kl": 0.05960363708436489, "learning_rate": 1.7146666666666669e-06, "loss": -0.02, "num_tokens": 1437264.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 89.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.515857994556427, "kl": 0.0425142552703619, "learning_rate": 1.7143333333333332e-06, "loss": 0.0022, "num_tokens": 1437584.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 89.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.10030697286128998, "kl": 0.026347876526415348, "learning_rate": 1.714e-06, "loss": 0.0015, "num_tokens": 1437888.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 90.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.415035367012024, "kl": 0.12497110664844513, "learning_rate": 1.7136666666666666e-06, "loss": -0.0147, "num_tokens": 1438258.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 90.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.38684168457984924, "kl": 0.026495108380913734, "learning_rate": 1.7133333333333334e-06, "loss": 0.0013, "num_tokens": 1438529.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 90.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00693944375962019, "kl": 0.001373795501422137, "learning_rate": 1.713e-06, "loss": 0.0001, "num_tokens": 1438806.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 90.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.054273296147584915, "kl": 0.010294564999639988, "learning_rate": 1.7126666666666667e-06, "loss": 0.0005, "num_tokens": 1439126.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 90.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05258876457810402, "kl": 0.01519824587740004, "learning_rate": 1.7123333333333335e-06, "loss": 0.0007, "num_tokens": 1439448.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014285714365541935, "clip_ratio/low_min": 0.014285714365541935, "clip_ratio/region_mean": 0.014285714365541935, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 90.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.7553659677505493, "kl": 0.07879587262868881, "learning_rate": 1.712e-06, "loss": 0.0109, "num_tokens": 1439749.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 90.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.9674148559570312, "kl": 0.11636403948068619, "learning_rate": 1.7116666666666669e-06, "loss": 0.0057, "num_tokens": 1440121.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 90.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10132623463869095, "kl": 0.019033951684832573, "learning_rate": 1.7113333333333332e-06, "loss": 0.001, "num_tokens": 1440464.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 90.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03728455305099487, "kl": 0.0023740706965327263, "learning_rate": 1.7110000000000002e-06, "loss": 0.0001, "num_tokens": 1440699.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.025413207709789276, "kl": 0.004943513544276357, "learning_rate": 1.7106666666666666e-06, "loss": 0.0002, "num_tokens": 1440988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 90.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 19.713300704956055, "kl": 0.07313615083694458, "learning_rate": 1.7103333333333334e-06, "loss": 0.1707, "num_tokens": 1441202.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 90.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.07446187734603882, "kl": 0.013546426314860582, "learning_rate": 1.71e-06, "loss": 0.0007, "num_tokens": 1441536.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 90.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.008628357201814651, "kl": 0.0004967711865901947, "learning_rate": 1.7096666666666667e-06, "loss": 0.0, "num_tokens": 1441796.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 90.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 3.7763540744781494, "kl": 0.021116905263625085, "learning_rate": 1.7093333333333335e-06, "loss": 0.0005, "num_tokens": 1442122.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 90.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.00874437764286995, "kl": 0.014907174278050661, "learning_rate": 1.709e-06, "loss": 0.0007, "num_tokens": 1442382.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 90.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.2355698198080063, "kl": 0.02511242777109146, "learning_rate": 1.7086666666666669e-06, "loss": 0.0013, "num_tokens": 1442642.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 90.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 5.532217979431152, "kl": 0.02727799816057086, "learning_rate": 1.7083333333333332e-06, "loss": 0.1836, "num_tokens": 1442926.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 90.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.022559387609362602, "kl": 0.04269120469689369, "learning_rate": 1.7080000000000002e-06, "loss": 0.0021, "num_tokens": 1443330.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 90.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 4.765388011932373, "kl": 0.0718580037355423, "learning_rate": 1.7076666666666666e-06, "loss": 0.1521, "num_tokens": 1443656.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 90.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04293379187583923, "kl": 0.027146708220243454, "learning_rate": 1.7073333333333333e-06, "loss": 0.0014, "num_tokens": 1444045.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 90.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.010079680010676384, "kl": 0.00021346905850805342, "learning_rate": 1.707e-06, "loss": 0.0, "num_tokens": 1444315.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 90.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05477239191532135, "kl": 0.00454887276282534, "learning_rate": 1.7066666666666667e-06, "loss": 0.0002, "num_tokens": 1444583.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 90.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01082111056894064, "kl": 0.16325707733631134, "learning_rate": 1.7063333333333335e-06, "loss": 0.0082, "num_tokens": 1444891.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 90.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002076560485875234, "kl": 5.111098289489746e-06, "learning_rate": 1.706e-06, "loss": 0.0, "num_tokens": 1445111.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4883 }, { "clip_ratio/high_max": 0.006097560748457909, "clip_ratio/high_mean": 0.006097560748457909, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006097560748457909, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 90.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.0956783294677734, "kl": 0.05870269238948822, "learning_rate": 1.7056666666666668e-06, "loss": 0.0503, "num_tokens": 1445488.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 90.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1301628053188324, "kl": 0.015491341473534703, "learning_rate": 1.7053333333333332e-06, "loss": 0.0008, "num_tokens": 1445779.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 90.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03917824104428291, "kl": 0.0007429122815665323, "learning_rate": 1.7050000000000002e-06, "loss": 0.0, "num_tokens": 1446035.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 90.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.017919491976499557, "kl": 0.004026137758046389, "learning_rate": 1.7046666666666666e-06, "loss": 0.0002, "num_tokens": 1446303.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 90.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009305633720941842, "kl": 0.0037599578499794006, "learning_rate": 1.7043333333333333e-06, "loss": 0.0002, "num_tokens": 1446539.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 90.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.013986870646476746, "kl": 0.0006524303462356329, "learning_rate": 1.704e-06, "loss": 0.0, "num_tokens": 1446865.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 90.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02480613999068737, "kl": 0.0013298190315254033, "learning_rate": 1.7036666666666667e-06, "loss": 0.0001, "num_tokens": 1447143.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 90.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.01889585331082344, "kl": 0.004025566508062184, "learning_rate": 1.7033333333333335e-06, "loss": 0.0002, "num_tokens": 1447435.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 90.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06282459199428558, "kl": 0.0034190231235697865, "learning_rate": 1.703e-06, "loss": 0.0002, "num_tokens": 1447735.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 90.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.1463141441345215, "kl": 0.08878976106643677, "learning_rate": 1.7026666666666668e-06, "loss": 0.2138, "num_tokens": 1448056.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 4893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 90.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007869921624660492, "kl": 0.0022259624674916267, "learning_rate": 1.7023333333333334e-06, "loss": 0.0001, "num_tokens": 1448368.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 90.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.923832654953003, "kl": 0.06465357914566994, "learning_rate": 1.7020000000000002e-06, "loss": 0.115, "num_tokens": 1448727.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 4895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 90.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.015185699798166752, "kl": 0.0004437565803527832, "learning_rate": 1.7016666666666665e-06, "loss": 0.0, "num_tokens": 1448935.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.004129840526729822, "kl": 0.0015180340269580483, "learning_rate": 1.7013333333333333e-06, "loss": 0.0001, "num_tokens": 1449219.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 90.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 6.470602989196777, "kl": 0.009461591951549053, "learning_rate": 1.7009999999999999e-06, "loss": 0.2991, "num_tokens": 1449451.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 4898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 90.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.09812650829553604, "kl": 0.030571318231523037, "learning_rate": 1.7006666666666667e-06, "loss": 0.0015, "num_tokens": 1449726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.009352663531899452, "kl": 0.009455515537410975, "learning_rate": 1.7003333333333335e-06, "loss": 0.0005, "num_tokens": 1449998.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 90.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.040579233318567276, "kl": 0.007567316293716431, "learning_rate": 1.7e-06, "loss": 0.0004, "num_tokens": 1450302.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 90.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.976142406463623, "kl": 0.12730075418949127, "learning_rate": 1.6996666666666668e-06, "loss": -0.0796, "num_tokens": 1450658.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 90.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.10649600625038147, "kl": 0.014784622006118298, "learning_rate": 1.6993333333333334e-06, "loss": 0.0007, "num_tokens": 1450942.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 90.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.5189959406852722, "kl": 0.05209812615066767, "learning_rate": 1.6990000000000002e-06, "loss": 0.0033, "num_tokens": 1451216.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.41963323950767517, "kl": 0.06770718470215797, "learning_rate": 1.6986666666666665e-06, "loss": 0.0036, "num_tokens": 1451504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 90.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.021894490346312523, "kl": 0.0008557852415833622, "learning_rate": 1.6983333333333333e-06, "loss": 0.0, "num_tokens": 1451815.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 90.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849485993385315, "kl": 0.0034385338658466935, "learning_rate": 1.6979999999999999e-06, "loss": 0.0002, "num_tokens": 1452034.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 90.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.016248758882284164, "kl": 0.2645472288131714, "learning_rate": 1.6976666666666667e-06, "loss": 0.0132, "num_tokens": 1452339.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04291994124650955, "kl": 0.023891227785497904, "learning_rate": 1.6973333333333334e-06, "loss": 0.0012, "num_tokens": 1452628.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 90.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.006555619183927774, "kl": 0.0007132887840270996, "learning_rate": 1.697e-06, "loss": 0.0, "num_tokens": 1452888.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 90.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11527550965547562, "kl": 0.037596405716612935, "learning_rate": 1.6966666666666668e-06, "loss": 0.0019, "num_tokens": 1453217.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 90.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.022737320512533188, "kl": 0.05129823461174965, "learning_rate": 1.6963333333333334e-06, "loss": 0.0026, "num_tokens": 1453553.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 90.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014274314744397998, "kl": 9.586289525032043e-05, "learning_rate": 1.6960000000000002e-06, "loss": 0.0, "num_tokens": 1453797.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.02939451113343239, "kl": 0.00025314688673461205, "learning_rate": 1.6956666666666665e-06, "loss": 0.0, "num_tokens": 1454010.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 91.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010532417334616184, "kl": 0.003958088462240994, "learning_rate": 1.6953333333333335e-06, "loss": 0.0002, "num_tokens": 1454278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007757003186270595, "kl": 0.0037833750247955322, "learning_rate": 1.6949999999999999e-06, "loss": 0.0002, "num_tokens": 1454514.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 91.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.07356951385736465, "kl": 0.022042195312678814, "learning_rate": 1.6946666666666666e-06, "loss": 0.001, "num_tokens": 1454858.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 91.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.033774372190237045, "kl": 0.002994000678882003, "learning_rate": 1.6943333333333334e-06, "loss": 0.0001, "num_tokens": 1455128.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.10086127370595932, "kl": 0.0036579921725206077, "learning_rate": 1.694e-06, "loss": 0.0002, "num_tokens": 1455346.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 91.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.7861106395721436, "kl": 0.0038848338299430907, "learning_rate": 1.6936666666666668e-06, "loss": 0.4534, "num_tokens": 1455873.0, "reward": 5.300000190734863, "reward_std": 4.400000095367432, "rewards/reward_combined/mean": 5.300000190734863, "rewards/reward_combined/std": 4.400000095367432, "step": 4920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 91.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.3681552410125732, "kl": 0.335488423705101, "learning_rate": 1.6933333333333334e-06, "loss": 0.1304, "num_tokens": 1456241.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 4921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 91.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04183509573340416, "kl": 0.008128313114866614, "learning_rate": 1.6930000000000001e-06, "loss": 0.0004, "num_tokens": 1456566.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 91.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 4.6732306480407715, "kl": 0.1125209890305996, "learning_rate": 1.6926666666666665e-06, "loss": 0.0594, "num_tokens": 1456910.0, "reward": 4.625, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 4.308422088623047, "step": 4923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 91.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.006506770849227905, "kl": 0.0006705879932269454, "learning_rate": 1.6923333333333335e-06, "loss": 0.0, "num_tokens": 1457170.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 91.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.02825717069208622, "kl": 0.001499679303378798, "learning_rate": 1.6919999999999999e-06, "loss": 0.0001, "num_tokens": 1457481.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 91.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.009554890915751457, "kl": 0.00931511353701353, "learning_rate": 1.6916666666666666e-06, "loss": 0.0005, "num_tokens": 1457753.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010792806278914213, "kl": 3.1888484954833984e-06, "learning_rate": 1.6913333333333334e-06, "loss": 0.0, "num_tokens": 1457973.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 91.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.031192228198051453, "kl": 0.004844009876251221, "learning_rate": 1.691e-06, "loss": 0.0002, "num_tokens": 1458185.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 91.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0283152237534523, "kl": 0.0005697508458979428, "learning_rate": 1.6906666666666668e-06, "loss": 0.0, "num_tokens": 1458397.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 91.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.047497108578681946, "kl": 0.0030733130406588316, "learning_rate": 1.6903333333333333e-06, "loss": 0.0002, "num_tokens": 1458692.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 91.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.022882595658302307, "kl": 0.0008376652549486607, "learning_rate": 1.6900000000000001e-06, "loss": 0.0, "num_tokens": 1458970.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 91.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.009101000614464283, "kl": 0.0018309559673070908, "learning_rate": 1.6896666666666665e-06, "loss": 0.0001, "num_tokens": 1459282.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 91.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.018523814156651497, "kl": 0.0007301649311557412, "learning_rate": 1.6893333333333335e-06, "loss": 0.0, "num_tokens": 1459602.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 91.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11225615441799164, "kl": 0.016585070174187422, "learning_rate": 1.6889999999999998e-06, "loss": 0.0008, "num_tokens": 1459910.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 91.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05018152669072151, "kl": 0.009000246413052082, "learning_rate": 1.6886666666666666e-06, "loss": 0.0004, "num_tokens": 1460243.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 91.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.012371880933642387, "kl": 0.00013869106987840496, "learning_rate": 1.6883333333333334e-06, "loss": 0.0, "num_tokens": 1460499.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 91.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08162127435207367, "kl": 0.007962659932672977, "learning_rate": 1.688e-06, "loss": 0.0004, "num_tokens": 1460792.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 91.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.005772865377366543, "kl": 0.00034815073013305664, "learning_rate": 1.6876666666666668e-06, "loss": 0.0, "num_tokens": 1461052.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 91.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03673241659998894, "kl": 0.002948103239759803, "learning_rate": 1.6873333333333333e-06, "loss": 0.0002, "num_tokens": 1461334.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 91.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.008549237623810768, "kl": 0.014917390421032906, "learning_rate": 1.6870000000000001e-06, "loss": 0.0007, "num_tokens": 1461594.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 91.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.12307348847389221, "kl": 0.01311293535400182, "learning_rate": 1.6866666666666667e-06, "loss": 0.0007, "num_tokens": 1461865.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 91.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 6.816999912261963, "kl": 0.09994024876505136, "learning_rate": 1.6863333333333335e-06, "loss": -0.0093, "num_tokens": 1462154.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 4942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 91.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.020536771044135094, "kl": 0.0005666979268426076, "learning_rate": 1.6860000000000002e-06, "loss": 0.0, "num_tokens": 1462450.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 91.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06226768344640732, "kl": 0.017683228012174368, "learning_rate": 1.6856666666666666e-06, "loss": 0.0008, "num_tokens": 1462777.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 91.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05985826253890991, "kl": 0.1075504794716835, "learning_rate": 1.6853333333333336e-06, "loss": 0.0054, "num_tokens": 1463149.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 91.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07356952875852585, "kl": 0.016571279615163803, "learning_rate": 1.685e-06, "loss": 0.0008, "num_tokens": 1463499.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 91.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07550165057182312, "kl": 0.015795729123055935, "learning_rate": 1.6846666666666667e-06, "loss": 0.0008, "num_tokens": 1463783.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 91.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.3067377507686615, "kl": 0.031524146907031536, "learning_rate": 1.6843333333333333e-06, "loss": 0.0017, "num_tokens": 1464089.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 91.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07108604907989502, "kl": 0.00878575723618269, "learning_rate": 1.684e-06, "loss": 0.0004, "num_tokens": 1464349.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.006079776678234339, "kl": 0.0016200095415115356, "learning_rate": 1.6836666666666667e-06, "loss": 0.0001, "num_tokens": 1464565.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.017288338392972946, "kl": 0.0006929486989974976, "learning_rate": 1.6833333333333335e-06, "loss": 0.0, "num_tokens": 1464777.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 91.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.028068071231245995, "kl": 0.002798804547637701, "learning_rate": 1.6830000000000002e-06, "loss": 0.0001, "num_tokens": 1465037.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 91.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.951070547103882, "kl": 0.07814485020935535, "learning_rate": 1.6826666666666666e-06, "loss": 0.1023, "num_tokens": 1465409.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 91.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625244528055191, "kl": 0.010406154673546553, "learning_rate": 1.6823333333333336e-06, "loss": 0.0005, "num_tokens": 1465745.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 91.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 3.840162992477417, "kl": 0.12812123447656631, "learning_rate": 1.682e-06, "loss": -0.0599, "num_tokens": 1466119.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 4955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 91.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.00025614103651605546, "kl": 7.656589150428772e-05, "learning_rate": 1.6816666666666667e-06, "loss": 0.0, "num_tokens": 1466363.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 91.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.16323304176330566, "kl": 0.02387662325054407, "learning_rate": 1.6813333333333333e-06, "loss": 0.0013, "num_tokens": 1466663.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 91.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.2361888438463211, "kl": 0.05485841631889343, "learning_rate": 1.681e-06, "loss": 0.0028, "num_tokens": 1466984.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 91.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 4.013523578643799, "kl": 0.03875024616718292, "learning_rate": 1.6806666666666667e-06, "loss": -0.0101, "num_tokens": 1467290.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 91.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09304482489824295, "kl": 0.006097201490774751, "learning_rate": 1.6803333333333334e-06, "loss": 0.0003, "num_tokens": 1467554.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 91.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04290613904595375, "kl": 0.16057635843753815, "learning_rate": 1.6800000000000002e-06, "loss": 0.008, "num_tokens": 1467864.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 91.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.11511863023042679, "kl": 0.04005991294980049, "learning_rate": 1.6796666666666666e-06, "loss": 0.002, "num_tokens": 1468133.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 91.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.008777844719588757, "kl": 0.00017987936735153198, "learning_rate": 1.6793333333333336e-06, "loss": 0.0, "num_tokens": 1468403.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 91.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.10896319150924683, "kl": 0.025346003472805023, "learning_rate": 1.679e-06, "loss": 0.0013, "num_tokens": 1468757.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 91.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.057124137878418, "kl": 0.005520134000107646, "learning_rate": 1.6786666666666667e-06, "loss": 0.0006, "num_tokens": 1469047.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 4965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 91.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03335168585181236, "kl": 0.0008818720234557986, "learning_rate": 1.6783333333333333e-06, "loss": 0.0, "num_tokens": 1469282.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 91.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.7279026508331299, "kl": 0.052693258970975876, "learning_rate": 1.678e-06, "loss": 0.0538, "num_tokens": 1469701.0, "reward": 2.875, "reward_std": 0.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 0.25, "step": 4967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 92.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.003178240731358528, "kl": 0.2684226781129837, "learning_rate": 1.6776666666666666e-06, "loss": 0.0134, "num_tokens": 1470005.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 92.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.807964324951172, "kl": 0.027837354689836502, "learning_rate": 1.6773333333333334e-06, "loss": 0.0666, "num_tokens": 1470296.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 92.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 17.87394905090332, "kl": 0.00461736461147666, "learning_rate": 1.6770000000000002e-06, "loss": 0.1489, "num_tokens": 1470514.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 92.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.07200003415346146, "kl": 0.008428729604929686, "learning_rate": 1.6766666666666668e-06, "loss": 0.0004, "num_tokens": 1470839.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 92.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07832275331020355, "kl": 0.008064561057835817, "learning_rate": 1.6763333333333336e-06, "loss": 0.0004, "num_tokens": 1471128.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 92.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 11.649913787841797, "kl": 0.0860157017596066, "learning_rate": 1.676e-06, "loss": -0.2037, "num_tokens": 1471406.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 92.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.14575444161891937, "kl": 0.01035630349360872, "learning_rate": 1.6756666666666667e-06, "loss": 0.0005, "num_tokens": 1471662.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 92.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.772905111312866, "kl": 0.02095145918428898, "learning_rate": 1.6753333333333333e-06, "loss": -0.0029, "num_tokens": 1471954.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 4975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.013888888992369175, "clip_ratio/region_mean": 0.013888888992369175, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 92.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.8809361457824707, "kl": 0.586449109017849, "learning_rate": 1.675e-06, "loss": -0.0046, "num_tokens": 1472255.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 92.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 11.035835266113281, "kl": 0.06366511806845665, "learning_rate": 1.6746666666666666e-06, "loss": -0.0861, "num_tokens": 1472465.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 92.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.383103370666504, "kl": 0.052995434030890465, "learning_rate": 1.6743333333333334e-06, "loss": 0.0419, "num_tokens": 1472813.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 92.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.08518697321414948, "kl": 0.002585767302662134, "learning_rate": 1.6740000000000002e-06, "loss": 0.0001, "num_tokens": 1473109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 92.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.10820410400629044, "kl": 0.021352801471948624, "learning_rate": 1.6736666666666668e-06, "loss": 0.0011, "num_tokens": 1473421.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 92.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.19683966040611267, "kl": 0.05924472399055958, "learning_rate": 1.6733333333333335e-06, "loss": 0.0029, "num_tokens": 1473837.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 92.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.025471318513154984, "kl": 0.0008457452058792114, "learning_rate": 1.673e-06, "loss": 0.0, "num_tokens": 1474097.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 92.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.015883689746260643, "kl": 0.0034166210098192096, "learning_rate": 1.6726666666666667e-06, "loss": 0.0002, "num_tokens": 1474365.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 92.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.007451504934579134, "kl": 0.00198943167924881, "learning_rate": 1.6723333333333333e-06, "loss": 0.0001, "num_tokens": 1474581.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 92.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 4.134199142456055, "kl": 0.08974160626530647, "learning_rate": 1.672e-06, "loss": 0.1165, "num_tokens": 1474926.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 92.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.038822464644908905, "kl": 0.0992378257215023, "learning_rate": 1.6716666666666666e-06, "loss": 0.005, "num_tokens": 1475298.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 92.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05400337278842926, "kl": 0.02195595996454358, "learning_rate": 1.6713333333333334e-06, "loss": 0.0011, "num_tokens": 1475569.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 92.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.037973422557115555, "kl": 0.004420760742505081, "learning_rate": 1.6710000000000002e-06, "loss": 0.0002, "num_tokens": 1475829.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 92.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 3.3632804843364283e-05, "kl": 2.8908252716064453e-06, "learning_rate": 1.6706666666666668e-06, "loss": 0.0, "num_tokens": 1476049.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4989 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.009615384973585606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 92.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 7.129278182983398, "kl": 0.04073519539088011, "learning_rate": 1.6703333333333335e-06, "loss": 0.1077, "num_tokens": 1476357.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 92.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006086123757995665, "kl": 0.0038093402981758118, "learning_rate": 1.6699999999999999e-06, "loss": 0.0002, "num_tokens": 1476593.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 92.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06056710705161095, "kl": 0.03574089426547289, "learning_rate": 1.6696666666666669e-06, "loss": 0.0018, "num_tokens": 1476896.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 92.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03683551028370857, "kl": 0.0018821939593181014, "learning_rate": 1.6693333333333332e-06, "loss": 0.0001, "num_tokens": 1477115.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 92.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03492877632379532, "kl": 0.0024379646638408303, "learning_rate": 1.669e-06, "loss": 0.0001, "num_tokens": 1477391.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 92.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.06660422682762146, "kl": 0.012176299467682838, "learning_rate": 1.6686666666666666e-06, "loss": 0.0006, "num_tokens": 1477729.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 92.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02502434141933918, "kl": 0.0010655286605469882, "learning_rate": 1.6683333333333334e-06, "loss": 0.0001, "num_tokens": 1478050.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 92.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06613127142190933, "kl": 0.004655712749809027, "learning_rate": 1.6680000000000002e-06, "loss": 0.0002, "num_tokens": 1478304.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 92.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01316966861486435, "kl": 0.005927043997871806, "learning_rate": 1.6676666666666667e-06, "loss": 0.0003, "num_tokens": 1478574.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 92.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.2286741733551025, "kl": 0.1367866089567542, "learning_rate": 1.6673333333333335e-06, "loss": 0.0673, "num_tokens": 1478939.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 92.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.3375140130519867, "kl": 0.048935662023723125, "learning_rate": 1.6669999999999999e-06, "loss": 0.0035, "num_tokens": 1479222.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 92.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032447674311697483, "kl": 0.26840534806251526, "learning_rate": 1.6666666666666669e-06, "loss": 0.0134, "num_tokens": 1479526.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 92.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.13131465017795563, "kl": 0.01511222030967474, "learning_rate": 1.6663333333333332e-06, "loss": 0.0008, "num_tokens": 1479871.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 92.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03960214927792549, "kl": 0.0015117888106033206, "learning_rate": 1.666e-06, "loss": 0.0001, "num_tokens": 1480135.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 92.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.2215038537979126, "kl": 0.022577311377972364, "learning_rate": 1.6656666666666666e-06, "loss": 0.0011, "num_tokens": 1480422.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 92.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05715415999293327, "kl": 0.00910698575899005, "learning_rate": 1.6653333333333334e-06, "loss": 0.0005, "num_tokens": 1480704.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 92.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.09245792776346207, "kl": 0.04517386294901371, "learning_rate": 1.6650000000000002e-06, "loss": 0.0022, "num_tokens": 1481056.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 92.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.12053433060646057, "kl": 0.027421538718044758, "learning_rate": 1.6646666666666667e-06, "loss": 0.0014, "num_tokens": 1481410.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 92.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0312466062605381, "kl": 0.0008065802976489067, "learning_rate": 1.6643333333333335e-06, "loss": 0.0, "num_tokens": 1481726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 92.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.028037171810865402, "kl": 0.0013543638633564115, "learning_rate": 1.6639999999999999e-06, "loss": 0.0001, "num_tokens": 1481998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 92.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.1913347244262695, "kl": 0.06364433281123638, "learning_rate": 1.6636666666666669e-06, "loss": 0.0175, "num_tokens": 1482374.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 92.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.013355693779885769, "kl": 0.0016948528354987502, "learning_rate": 1.6633333333333332e-06, "loss": 0.0001, "num_tokens": 1482651.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 92.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0109371617436409, "kl": 0.014523950405418873, "learning_rate": 1.663e-06, "loss": 0.0007, "num_tokens": 1482911.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5012 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 92.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 5.241844177246094, "kl": 0.02016565576195717, "learning_rate": 1.6626666666666666e-06, "loss": -0.0023, "num_tokens": 1483239.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 92.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04952339082956314, "kl": 0.006025760900229216, "learning_rate": 1.6623333333333334e-06, "loss": 0.0003, "num_tokens": 1483541.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 92.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04403427988290787, "kl": 0.1632367968559265, "learning_rate": 1.6620000000000001e-06, "loss": 0.0082, "num_tokens": 1483850.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 92.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.003072462510317564, "kl": 0.00010261541319778189, "learning_rate": 1.6616666666666667e-06, "loss": 0.0, "num_tokens": 1484104.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 92.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03218698874115944, "kl": 0.004485756158828735, "learning_rate": 1.6613333333333335e-06, "loss": 0.0002, "num_tokens": 1484316.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 92.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07350842654705048, "kl": 0.015073230490088463, "learning_rate": 1.661e-06, "loss": 0.0008, "num_tokens": 1484608.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 92.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.041076578199863434, "kl": 0.003065375378355384, "learning_rate": 1.6606666666666668e-06, "loss": 0.0002, "num_tokens": 1484900.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 92.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.007892746478319168, "kl": 0.010176368523389101, "learning_rate": 1.6603333333333332e-06, "loss": 0.0005, "num_tokens": 1485172.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 92.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.15762390196323395, "kl": 0.005376547574996948, "learning_rate": 1.66e-06, "loss": 0.0003, "num_tokens": 1485404.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03976396843791008, "kl": 0.011428375728428364, "learning_rate": 1.6596666666666666e-06, "loss": 0.0006, "num_tokens": 1485688.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 93.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.26559591293335, "kl": 0.06295810453593731, "learning_rate": 1.6593333333333333e-06, "loss": 0.0151, "num_tokens": 1486048.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.27831727266311646, "kl": 0.031008249148726463, "learning_rate": 1.6590000000000001e-06, "loss": 0.0016, "num_tokens": 1486334.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 5.607852935791016, "kl": 0.050431785377440974, "learning_rate": 1.6586666666666667e-06, "loss": 0.0515, "num_tokens": 1486621.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 93.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03103502467274666, "kl": 0.0013500666827894747, "learning_rate": 1.6583333333333335e-06, "loss": 0.0001, "num_tokens": 1486940.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 93.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.017780333757400513, "kl": 0.0492660328745842, "learning_rate": 1.658e-06, "loss": 0.0025, "num_tokens": 1487272.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 93.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.433762788772583, "kl": 0.043206318136071786, "learning_rate": 1.6576666666666668e-06, "loss": 0.0029, "num_tokens": 1487513.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 93.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05056290328502655, "kl": 0.004398422548547387, "learning_rate": 1.6573333333333332e-06, "loss": 0.0002, "num_tokens": 1487801.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 93.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013421811163425446, "kl": 4.9064554332289845e-05, "learning_rate": 1.657e-06, "loss": 0.0, "num_tokens": 1488073.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 93.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.031043440103530884, "kl": 0.00475098192691803, "learning_rate": 1.6566666666666665e-06, "loss": 0.0002, "num_tokens": 1488285.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 93.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 1.1078578233718872, "kl": 0.47988858609460294, "learning_rate": 1.6563333333333333e-06, "loss": 0.0241, "num_tokens": 1488567.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 93.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008925163419917226, "kl": 0.00011397525668144226, "learning_rate": 1.6560000000000001e-06, "loss": 0.0, "num_tokens": 1488811.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08188676834106445, "kl": 0.009675647597759962, "learning_rate": 1.6556666666666667e-06, "loss": 0.0005, "num_tokens": 1489115.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 93.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.011076646856963634, "kl": 0.014485355466604233, "learning_rate": 1.6553333333333335e-06, "loss": 0.0007, "num_tokens": 1489375.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 93.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.05944862961769104, "kl": 0.00756123336032033, "learning_rate": 1.655e-06, "loss": 0.0004, "num_tokens": 1489714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 93.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.024482984095811844, "kl": 0.001643153140321374, "learning_rate": 1.6546666666666668e-06, "loss": 0.0001, "num_tokens": 1489994.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 93.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 8.84996509552002, "kl": 0.019488862904836424, "learning_rate": 1.6543333333333332e-06, "loss": 0.1299, "num_tokens": 1490215.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 93.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.11664830148220062, "kl": 0.04044315405189991, "learning_rate": 1.6540000000000002e-06, "loss": 0.002, "num_tokens": 1490517.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 93.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.052643537521362305, "kl": 0.06707258895039558, "learning_rate": 1.6536666666666665e-06, "loss": 0.0034, "num_tokens": 1490895.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 93.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03135155141353607, "kl": 0.04444164037704468, "learning_rate": 1.6533333333333333e-06, "loss": 0.0022, "num_tokens": 1491299.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 93.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.2690921127796173, "kl": 0.03421100229024887, "learning_rate": 1.653e-06, "loss": 0.0017, "num_tokens": 1491505.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 93.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.8561997413635254, "kl": 0.07261555641889572, "learning_rate": 1.6526666666666667e-06, "loss": 0.0041, "num_tokens": 1491843.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 93.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.026965666562318802, "kl": 0.0013179140514694154, "learning_rate": 1.6523333333333335e-06, "loss": 0.0001, "num_tokens": 1492111.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 93.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.09048201888799667, "kl": 0.0154511583968997, "learning_rate": 1.652e-06, "loss": 0.0009, "num_tokens": 1492390.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 93.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07614016532897949, "kl": 0.0038604214787483215, "learning_rate": 1.6516666666666668e-06, "loss": 0.0002, "num_tokens": 1492602.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 93.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04271329939365387, "kl": 0.0010043196380138397, "learning_rate": 1.6513333333333332e-06, "loss": 0.0001, "num_tokens": 1492862.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 93.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01605292037129402, "kl": 0.00045746201067231596, "learning_rate": 1.6510000000000002e-06, "loss": 0.0, "num_tokens": 1493178.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 93.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.003291687462478876, "kl": 0.26838211715221405, "learning_rate": 1.6506666666666665e-06, "loss": 0.0134, "num_tokens": 1493482.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 93.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.028504453599452972, "kl": 0.0004374742457002867, "learning_rate": 1.6503333333333333e-06, "loss": 0.0, "num_tokens": 1493738.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 93.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 1.340577483177185, "kl": 0.10927124321460724, "learning_rate": 1.65e-06, "loss": 0.0065, "num_tokens": 1493956.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 93.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01485071238130331, "kl": 0.001964425668120384, "learning_rate": 1.6496666666666667e-06, "loss": 0.0001, "num_tokens": 1494268.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 93.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.12154436111450195, "kl": 0.019749329425394535, "learning_rate": 1.6493333333333334e-06, "loss": 0.001, "num_tokens": 1494562.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 93.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.277632236480713, "kl": 0.1807153820991516, "learning_rate": 1.649e-06, "loss": -0.0239, "num_tokens": 1494869.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 93.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.049506545066833496, "kl": 0.014848444610834122, "learning_rate": 1.6486666666666668e-06, "loss": 0.0007, "num_tokens": 1495166.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 93.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.047044917941093445, "kl": 0.01242524420376867, "learning_rate": 1.6483333333333332e-06, "loss": 0.0006, "num_tokens": 1495488.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 93.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03062351793050766, "kl": 0.004175095586106181, "learning_rate": 1.6480000000000001e-06, "loss": 0.0002, "num_tokens": 1495786.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 93.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 4.3765788078308105, "kl": 0.5038115493953228, "learning_rate": 1.6476666666666665e-06, "loss": 0.0664, "num_tokens": 1496116.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 5058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 93.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 5.166257858276367, "kl": 0.01836752239614725, "learning_rate": 1.6473333333333333e-06, "loss": 0.2176, "num_tokens": 1496461.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 93.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017637595010455698, "kl": 5.759298801422119e-06, "learning_rate": 1.647e-06, "loss": 0.0, "num_tokens": 1496681.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 93.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05268222838640213, "kl": 0.019020277075469494, "learning_rate": 1.6466666666666666e-06, "loss": 0.001, "num_tokens": 1497040.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 93.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.9249795079231262, "kl": 0.21187730878591537, "learning_rate": 1.6463333333333334e-06, "loss": 0.0127, "num_tokens": 1497447.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 93.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.10928282141685486, "kl": 0.010948408860713243, "learning_rate": 1.646e-06, "loss": 0.0005, "num_tokens": 1497715.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 93.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.020056122913956642, "kl": 0.003073722356930375, "learning_rate": 1.6456666666666668e-06, "loss": 0.0002, "num_tokens": 1498045.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 93.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03642801195383072, "kl": 0.007461420493200421, "learning_rate": 1.6453333333333333e-06, "loss": 0.0004, "num_tokens": 1498368.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 93.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.011308939196169376, "kl": 0.0013963497476652265, "learning_rate": 1.6450000000000001e-06, "loss": 0.0001, "num_tokens": 1498638.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 93.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03301926702260971, "kl": 0.003885791782522574, "learning_rate": 1.644666666666667e-06, "loss": 0.0002, "num_tokens": 1498896.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 93.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05412767454981804, "kl": 0.02548685297369957, "learning_rate": 1.6443333333333333e-06, "loss": 0.0013, "num_tokens": 1499169.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 93.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.027327850461006165, "kl": 0.0021450609201565385, "learning_rate": 1.6440000000000003e-06, "loss": 0.0001, "num_tokens": 1499429.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 93.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.13876545429229736, "kl": 0.04095316492021084, "learning_rate": 1.6436666666666666e-06, "loss": 0.002, "num_tokens": 1499748.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.007521340623497963, "kl": 0.01020409632474184, "learning_rate": 1.6433333333333334e-06, "loss": 0.0005, "num_tokens": 1500020.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 93.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.000577855680603534, "kl": 0.0038079768419265747, "learning_rate": 1.643e-06, "loss": 0.0002, "num_tokens": 1500256.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 93.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.3283426761627197, "kl": 0.053489550948143005, "learning_rate": 1.6426666666666668e-06, "loss": -0.034, "num_tokens": 1500618.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 93.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.780770778656006, "kl": 0.03146049380302429, "learning_rate": 1.6423333333333333e-06, "loss": 0.0773, "num_tokens": 1500894.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.026120349764823914, "kl": 0.007943099364638329, "learning_rate": 1.6420000000000001e-06, "loss": 0.0004, "num_tokens": 1501172.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 94.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0223385076969862, "kl": 0.0049424098688177764, "learning_rate": 1.641666666666667e-06, "loss": 0.0002, "num_tokens": 1501465.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 94.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1057843565940857, "kl": 0.006456733332015574, "learning_rate": 1.6413333333333333e-06, "loss": 0.0003, "num_tokens": 1501735.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 94.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.7795674800872803, "kl": 0.07165789604187012, "learning_rate": 1.6410000000000003e-06, "loss": 0.0025, "num_tokens": 1502017.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 94.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06185257434844971, "kl": 0.00764935789629817, "learning_rate": 1.6406666666666666e-06, "loss": 0.0004, "num_tokens": 1502348.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 94.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.08810275048017502, "kl": 0.00854942761361599, "learning_rate": 1.6403333333333334e-06, "loss": 0.0004, "num_tokens": 1502616.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 94.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.20623786747455597, "kl": 0.031064768321812153, "learning_rate": 1.64e-06, "loss": 0.0019, "num_tokens": 1502900.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 94.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.12593761086463928, "kl": 0.012586106546223164, "learning_rate": 1.6396666666666668e-06, "loss": 0.0006, "num_tokens": 1503195.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 94.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.01429614145308733, "kl": 0.0005785822868347168, "learning_rate": 1.6393333333333333e-06, "loss": 0.0, "num_tokens": 1503407.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 94.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.658608913421631, "kl": 0.0704478845000267, "learning_rate": 1.639e-06, "loss": 0.0446, "num_tokens": 1503725.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 94.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.04747282713651657, "kl": 0.017318569123744965, "learning_rate": 1.6386666666666669e-06, "loss": 0.0009, "num_tokens": 1504025.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 94.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 1.3169313669204712, "kl": 0.22855431586503983, "learning_rate": 1.6383333333333332e-06, "loss": 0.0293, "num_tokens": 1504433.0, "reward": 2.875, "reward_std": 0.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 0.25, "step": 5086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 94.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.659715414047241, "kl": 0.15037638694047928, "learning_rate": 1.6380000000000002e-06, "loss": -0.0188, "num_tokens": 1504773.0, "reward": 3.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 3.674234628677368, "step": 5087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 94.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03332719951868057, "kl": 0.004549508390482515, "learning_rate": 1.6376666666666666e-06, "loss": 0.0002, "num_tokens": 1505075.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 94.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.11035460978746414, "kl": 0.022511586954351515, "learning_rate": 1.6373333333333334e-06, "loss": 0.0011, "num_tokens": 1505341.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 94.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07171207666397095, "kl": 0.011405151803046465, "learning_rate": 1.637e-06, "loss": 0.0006, "num_tokens": 1505671.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 94.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.1093517541885376, "kl": 0.006516335415653884, "learning_rate": 1.6366666666666667e-06, "loss": 0.0003, "num_tokens": 1505995.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 94.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.038217782974243164, "kl": 0.005558681208640337, "learning_rate": 1.6363333333333333e-06, "loss": 0.0003, "num_tokens": 1506285.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 94.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05555345118045807, "kl": 0.001620567578356713, "learning_rate": 1.636e-06, "loss": 0.0001, "num_tokens": 1506581.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 94.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.09022354334592819, "kl": 0.0028458016458898783, "learning_rate": 1.6356666666666669e-06, "loss": 0.0001, "num_tokens": 1506855.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 94.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07398438453674316, "kl": 0.0136982761323452, "learning_rate": 1.6353333333333334e-06, "loss": 0.0007, "num_tokens": 1507179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 94.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03835136443376541, "kl": 0.0019263034919276834, "learning_rate": 1.6350000000000002e-06, "loss": 0.0001, "num_tokens": 1507500.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 94.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07044315338134766, "kl": 0.0024491348303854465, "learning_rate": 1.6346666666666666e-06, "loss": 0.0001, "num_tokens": 1507734.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 94.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.664107084274292, "kl": 0.009934463538229465, "learning_rate": 1.6343333333333334e-06, "loss": 0.0353, "num_tokens": 1508023.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 94.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012818897375836968, "kl": 0.00024852753267623484, "learning_rate": 1.634e-06, "loss": 0.0, "num_tokens": 1508243.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 94.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07100926339626312, "kl": 0.01244360813871026, "learning_rate": 1.6336666666666667e-06, "loss": 0.0006, "num_tokens": 1508529.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.75, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 64.75, "completions/mean_terminated_length": 64.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 94.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.1297500133514404, "kl": 0.0583345852792263, "learning_rate": 1.6333333333333333e-06, "loss": 0.2886, "num_tokens": 1509004.0, "reward": 6.375, "reward_std": 3.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 3.25, "step": 5101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 94.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022757509723305702, "kl": 0.00015012547373771667, "learning_rate": 1.633e-06, "loss": 0.0, "num_tokens": 1509248.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 94.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05654054507613182, "kl": 0.011175133055076003, "learning_rate": 1.6326666666666669e-06, "loss": 0.0006, "num_tokens": 1509590.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 94.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03896370530128479, "kl": 0.00512346881441772, "learning_rate": 1.6323333333333334e-06, "loss": 0.0003, "num_tokens": 1509859.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 94.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008043305017054081, "kl": 0.07291961647570133, "learning_rate": 1.6320000000000002e-06, "loss": 0.0037, "num_tokens": 1510229.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 94.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.012667941860854626, "kl": 0.002352175652049482, "learning_rate": 1.6316666666666666e-06, "loss": 0.0001, "num_tokens": 1510506.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 94.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038390362169593573, "kl": 0.26831941306591034, "learning_rate": 1.6313333333333334e-06, "loss": 0.0134, "num_tokens": 1510810.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 94.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.051037311553955, "kl": 0.17304487526416779, "learning_rate": 1.631e-06, "loss": -0.0328, "num_tokens": 1511184.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 94.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.16897113621234894, "kl": 0.1642039492726326, "learning_rate": 1.6306666666666667e-06, "loss": 0.0082, "num_tokens": 1511498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 94.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.006400995887815952, "kl": 0.00047546329733449966, "learning_rate": 1.6303333333333333e-06, "loss": 0.0, "num_tokens": 1511758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 94.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.053089018911123276, "kl": 0.0031430646777153015, "learning_rate": 1.63e-06, "loss": 0.0001, "num_tokens": 1512012.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 94.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0484151653945446, "kl": 0.027952161617577076, "learning_rate": 1.6296666666666668e-06, "loss": 0.0014, "num_tokens": 1512286.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 94.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 5.124202251434326, "kl": 0.05679469741880894, "learning_rate": 1.6293333333333334e-06, "loss": 0.0574, "num_tokens": 1512609.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 5113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 94.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.042268916964530945, "kl": 0.02734996471554041, "learning_rate": 1.6290000000000002e-06, "loss": 0.0015, "num_tokens": 1512973.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 94.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.10678981244564056, "kl": 0.051696695387363434, "learning_rate": 1.6286666666666666e-06, "loss": 0.0023, "num_tokens": 1513327.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 94.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02062048763036728, "kl": 0.000997929397271946, "learning_rate": 1.6283333333333336e-06, "loss": 0.0, "num_tokens": 1513636.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5116 }, { "clip_ratio/high_max": 0.007936508394777775, "clip_ratio/high_mean": 0.007936508394777775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007936508394777775, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 94.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.1902663707733154, "kl": 0.03688059840351343, "learning_rate": 1.628e-06, "loss": 0.1702, "num_tokens": 1514015.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 94.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01191367581486702, "kl": 0.014215979259461164, "learning_rate": 1.6276666666666667e-06, "loss": 0.0007, "num_tokens": 1514275.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 94.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005249172449112, "kl": 0.009833820164203644, "learning_rate": 1.6273333333333333e-06, "loss": 0.0005, "num_tokens": 1514545.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 94.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.4023919403553009, "kl": 0.029481276869773865, "learning_rate": 1.627e-06, "loss": 0.0015, "num_tokens": 1514753.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 94.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.06025628000497818, "kl": 0.011574001866392791, "learning_rate": 1.6266666666666668e-06, "loss": 0.0005, "num_tokens": 1515026.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 94.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.032000113278627396, "kl": 0.000591340649407357, "learning_rate": 1.6263333333333334e-06, "loss": 0.0, "num_tokens": 1515282.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 94.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00019223337585572153, "kl": 6.541609764099121e-06, "learning_rate": 1.6260000000000002e-06, "loss": 0.0, "num_tokens": 1515502.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 94.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007296347175724804, "kl": 0.003776274621486664, "learning_rate": 1.6256666666666665e-06, "loss": 0.0002, "num_tokens": 1515738.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 94.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10547690838575363, "kl": 0.00818365067243576, "learning_rate": 1.6253333333333335e-06, "loss": 0.0004, "num_tokens": 1515954.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 94.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.031101077795028687, "kl": 0.0028826892375946045, "learning_rate": 1.625e-06, "loss": 0.0001, "num_tokens": 1516166.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 94.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.014484002254903316, "kl": 0.0019085241947323084, "learning_rate": 1.6246666666666667e-06, "loss": 0.0001, "num_tokens": 1516480.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 94.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.21377907693386078, "kl": 0.04807031853124499, "learning_rate": 1.6243333333333333e-06, "loss": 0.0024, "num_tokens": 1516770.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 94.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.3453080952167511, "kl": 0.02882286161184311, "learning_rate": 1.624e-06, "loss": 0.002, "num_tokens": 1517039.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 95.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.12480010837316513, "kl": 0.017304659821093082, "learning_rate": 1.6236666666666668e-06, "loss": 0.0009, "num_tokens": 1517332.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 95.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010862167924642563, "kl": 0.00013622641017718706, "learning_rate": 1.6233333333333334e-06, "loss": 0.0, "num_tokens": 1517588.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 95.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.13647964596748352, "kl": 0.013996335212141275, "learning_rate": 1.6230000000000002e-06, "loss": 0.0008, "num_tokens": 1517855.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 95.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04975675791501999, "kl": 0.011340032564476132, "learning_rate": 1.6226666666666665e-06, "loss": 0.0006, "num_tokens": 1518133.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 95.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.015183904208242893, "kl": 0.0004988627406419255, "learning_rate": 1.6223333333333335e-06, "loss": 0.0, "num_tokens": 1518442.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 95.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.36430010199546814, "kl": 0.02252253331243992, "learning_rate": 1.6219999999999999e-06, "loss": 0.0014, "num_tokens": 1518711.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 95.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.011199631728231907, "kl": 0.014363312162458897, "learning_rate": 1.6216666666666667e-06, "loss": 0.0007, "num_tokens": 1518971.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 95.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05741075798869133, "kl": 0.008718229364603758, "learning_rate": 1.6213333333333332e-06, "loss": 0.0004, "num_tokens": 1519244.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 95.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.003971273545175791, "kl": 0.2682976573705673, "learning_rate": 1.621e-06, "loss": 0.0134, "num_tokens": 1519548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 95.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.07563067972660065, "kl": 0.018135390244424343, "learning_rate": 1.6206666666666668e-06, "loss": 0.0009, "num_tokens": 1519840.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 95.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.039467714726924896, "kl": 0.001513257622718811, "learning_rate": 1.6203333333333334e-06, "loss": 0.0001, "num_tokens": 1520084.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 95.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03072650358080864, "kl": 0.002341926097869873, "learning_rate": 1.6200000000000002e-06, "loss": 0.0001, "num_tokens": 1520296.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 95.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.15167327225208282, "kl": 0.014023125171661377, "learning_rate": 1.6196666666666667e-06, "loss": 0.0007, "num_tokens": 1520508.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 95.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0114447558298707, "kl": 0.0013392396504059434, "learning_rate": 1.6193333333333335e-06, "loss": 0.0001, "num_tokens": 1520806.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 95.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.03400212526321411, "kl": 0.0018738221260719001, "learning_rate": 1.6189999999999999e-06, "loss": 0.0001, "num_tokens": 1521096.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 95.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.08252135664224625, "kl": 0.00547359639313072, "learning_rate": 1.6186666666666667e-06, "loss": 0.0003, "num_tokens": 1521362.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 95.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.09761747717857361, "kl": 0.014867460820823908, "learning_rate": 1.6183333333333332e-06, "loss": 0.0008, "num_tokens": 1521686.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 95.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 5.120587348937988, "kl": 0.16405999660491943, "learning_rate": 1.618e-06, "loss": -0.2575, "num_tokens": 1522003.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 5147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 95.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.12070372700691223, "kl": 0.17373964935541153, "learning_rate": 1.6176666666666668e-06, "loss": 0.0088, "num_tokens": 1522318.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 95.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024428071919828653, "kl": 0.0003391765058040619, "learning_rate": 1.6173333333333334e-06, "loss": 0.0, "num_tokens": 1522578.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 95.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07362038642168045, "kl": 0.0026927399449050426, "learning_rate": 1.6170000000000001e-06, "loss": 0.0001, "num_tokens": 1522812.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 95.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.200491428375244, "kl": 0.04430763237178326, "learning_rate": 1.6166666666666667e-06, "loss": 0.1794, "num_tokens": 1523098.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 95.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03638202324509621, "kl": 0.006569494726136327, "learning_rate": 1.6163333333333335e-06, "loss": 0.0003, "num_tokens": 1523432.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 95.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.450490713119507, "kl": 0.07818220183253288, "learning_rate": 1.6159999999999999e-06, "loss": -0.1063, "num_tokens": 1523795.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 95.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.0378687381744385, "kl": 0.06665090471506119, "learning_rate": 1.6156666666666666e-06, "loss": 0.0043, "num_tokens": 1524178.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 5154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 95.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03414076566696167, "kl": 0.000407390296459198, "learning_rate": 1.6153333333333332e-06, "loss": 0.0, "num_tokens": 1524390.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 95.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.12677936255931854, "kl": 0.02297324687242508, "learning_rate": 1.615e-06, "loss": 0.0012, "num_tokens": 1524724.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 95.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05070136860013008, "kl": 0.004233626881614327, "learning_rate": 1.6146666666666668e-06, "loss": 0.0002, "num_tokens": 1525028.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 95.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008868768927641213, "kl": 0.0037405937910079956, "learning_rate": 1.6143333333333333e-06, "loss": 0.0002, "num_tokens": 1525264.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 95.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.005571421701461077, "kl": 0.001623174932319671, "learning_rate": 1.6140000000000001e-06, "loss": 0.0001, "num_tokens": 1525544.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 95.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.09620469808578491, "kl": 0.013978281989693642, "learning_rate": 1.6136666666666667e-06, "loss": 0.0007, "num_tokens": 1525856.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 95.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.103268563747406, "kl": 0.011964778881520033, "learning_rate": 1.6133333333333335e-06, "loss": 0.0006, "num_tokens": 1526164.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 95.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 1.62724769115448, "kl": 0.08902581129223108, "learning_rate": 1.6129999999999998e-06, "loss": 0.0044, "num_tokens": 1526470.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 54.75, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 95.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.4955475330352783, "kl": 0.11508592963218689, "learning_rate": 1.6126666666666666e-06, "loss": 0.1889, "num_tokens": 1526905.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 95.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.25937536358833313, "kl": 0.04300510138273239, "learning_rate": 1.6123333333333332e-06, "loss": 0.0022, "num_tokens": 1527204.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 95.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.5185606479644775, "kl": 0.08510687947273254, "learning_rate": 1.612e-06, "loss": 0.0502, "num_tokens": 1527554.0, "reward": 4.25, "reward_std": 4.27200174331665, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 4.27200174331665, "step": 5165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 95.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03310784697532654, "kl": 0.0032910079462453723, "learning_rate": 1.6116666666666668e-06, "loss": 0.0002, "num_tokens": 1527814.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 95.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.22392171621322632, "kl": 0.023513258900493383, "learning_rate": 1.6113333333333333e-06, "loss": 0.0012, "num_tokens": 1528090.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 95.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042220475152134895, "kl": 0.0002702832280192524, "learning_rate": 1.6110000000000001e-06, "loss": 0.0, "num_tokens": 1528310.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 95.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.055837348103523254, "kl": 0.0033182734914589673, "learning_rate": 1.6106666666666667e-06, "loss": 0.0001, "num_tokens": 1528584.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 95.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.4184170067310333, "kl": 0.06885102717205882, "learning_rate": 1.6103333333333335e-06, "loss": 0.0032, "num_tokens": 1528911.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 95.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.16252626478672028, "kl": 0.05184198170900345, "learning_rate": 1.6099999999999998e-06, "loss": 0.0027, "num_tokens": 1529216.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 95.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03228530287742615, "kl": 0.005028032814152539, "learning_rate": 1.6096666666666668e-06, "loss": 0.0003, "num_tokens": 1529498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 95.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.00021318643121048808, "kl": 7.450580596923828e-06, "learning_rate": 1.6093333333333332e-06, "loss": 0.0, "num_tokens": 1529718.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 95.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.12534946203231812, "kl": 0.020774316042661667, "learning_rate": 1.609e-06, "loss": 0.001, "num_tokens": 1529993.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 95.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.024069350212812424, "kl": 0.0032211471116170287, "learning_rate": 1.6086666666666668e-06, "loss": 0.0002, "num_tokens": 1530321.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 95.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03737330436706543, "kl": 0.10044170543551445, "learning_rate": 1.6083333333333333e-06, "loss": 0.005, "num_tokens": 1530693.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 95.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0907745435833931, "kl": 0.04705662652850151, "learning_rate": 1.608e-06, "loss": 0.0023, "num_tokens": 1531055.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 95.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.016896385699510574, "kl": 0.0008772032451815903, "learning_rate": 1.6076666666666667e-06, "loss": 0.0, "num_tokens": 1531372.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 95.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.31291264295578003, "kl": 0.02850928157567978, "learning_rate": 1.6073333333333335e-06, "loss": 0.0015, "num_tokens": 1531646.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 95.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.1370113044977188, "kl": 0.017792532220482826, "learning_rate": 1.6069999999999998e-06, "loss": 0.0009, "num_tokens": 1531934.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 95.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06672362238168716, "kl": 0.0038387924432754517, "learning_rate": 1.6066666666666668e-06, "loss": 0.0002, "num_tokens": 1532150.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 95.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05014263838529587, "kl": 0.01246769493445754, "learning_rate": 1.6063333333333332e-06, "loss": 0.0006, "num_tokens": 1532444.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 95.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01600436121225357, "kl": 0.0005118479311931878, "learning_rate": 1.606e-06, "loss": 0.0, "num_tokens": 1532714.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 96.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03981270268559456, "kl": 0.056245286017656326, "learning_rate": 1.6056666666666667e-06, "loss": 0.0028, "num_tokens": 1533119.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 96.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 4.256351470947266, "kl": 0.7909174561500549, "learning_rate": 1.6053333333333333e-06, "loss": 0.0387, "num_tokens": 1533422.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 96.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 7.734254360198975, "kl": 0.1715548001229763, "learning_rate": 1.605e-06, "loss": 0.0486, "num_tokens": 1533720.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 96.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.4434525966644287, "kl": 0.1689465567469597, "learning_rate": 1.6046666666666667e-06, "loss": -0.0827, "num_tokens": 1534088.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 5187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 96.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03285718336701393, "kl": 0.0008119975100271404, "learning_rate": 1.6043333333333334e-06, "loss": 0.0, "num_tokens": 1534352.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 96.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.1380830556154251, "kl": 0.025421341881155968, "learning_rate": 1.604e-06, "loss": 0.0013, "num_tokens": 1534696.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 96.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.5371273159980774, "kl": 0.13559747487306595, "learning_rate": 1.6036666666666668e-06, "loss": 0.0069, "num_tokens": 1535036.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 96.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.086024284362793, "kl": 0.03677371144294739, "learning_rate": 1.6033333333333336e-06, "loss": 0.1202, "num_tokens": 1535308.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 96.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008534154039807618, "kl": 0.0037450119853019714, "learning_rate": 1.603e-06, "loss": 0.0002, "num_tokens": 1535544.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03920384496450424, "kl": 0.007517733611166477, "learning_rate": 1.602666666666667e-06, "loss": 0.0003, "num_tokens": 1535836.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 96.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.013419232331216335, "kl": 0.0004963747051078826, "learning_rate": 1.6023333333333333e-06, "loss": 0.0, "num_tokens": 1536072.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 96.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.00020311851403675973, "kl": 7.27921724319458e-06, "learning_rate": 1.602e-06, "loss": 0.0, "num_tokens": 1536292.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 96.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.9683189392089844, "kl": 0.028609320521354675, "learning_rate": 1.6016666666666666e-06, "loss": 0.0539, "num_tokens": 1536586.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 96.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 4.197132587432861, "kl": 0.05988541431725025, "learning_rate": 1.6013333333333334e-06, "loss": 0.236, "num_tokens": 1536951.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 96.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.23637382686138153, "kl": 0.04909998178482056, "learning_rate": 1.601e-06, "loss": 0.0026, "num_tokens": 1537290.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.124506950378418, "kl": 0.01471470925025642, "learning_rate": 1.6006666666666668e-06, "loss": 0.0452, "num_tokens": 1537576.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 96.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.13501040637493134, "kl": 0.014340505935251713, "learning_rate": 1.6003333333333336e-06, "loss": 0.0007, "num_tokens": 1537848.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 96.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03232679143548012, "kl": 0.0013084628735668957, "learning_rate": 1.6e-06, "loss": 0.0001, "num_tokens": 1538144.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 96.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.008099677972495556, "kl": 0.0001684397502685897, "learning_rate": 1.599666666666667e-06, "loss": 0.0, "num_tokens": 1538400.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 96.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07905364036560059, "kl": 0.016734112985432148, "learning_rate": 1.5993333333333333e-06, "loss": 0.0009, "num_tokens": 1538734.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10110728442668915, "kl": 0.014798803720623255, "learning_rate": 1.599e-06, "loss": 0.0007, "num_tokens": 1539025.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 96.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.11827294528484344, "kl": 0.0057364702224731445, "learning_rate": 1.5986666666666666e-06, "loss": 0.0003, "num_tokens": 1539231.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 96.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02143985405564308, "kl": 0.004726713988929987, "learning_rate": 1.5983333333333334e-06, "loss": 0.0002, "num_tokens": 1539489.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 96.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05490705743432045, "kl": 0.016320059075951576, "learning_rate": 1.598e-06, "loss": 0.0008, "num_tokens": 1539788.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.003941059112549, "kl": 0.01771488878875971, "learning_rate": 1.5976666666666668e-06, "loss": 0.0905, "num_tokens": 1540082.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07804497331380844, "kl": 0.0273137129843235, "learning_rate": 1.5973333333333336e-06, "loss": 0.0014, "num_tokens": 1540354.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 96.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.043495986610651016, "kl": 0.007365534518612549, "learning_rate": 1.597e-06, "loss": 0.0004, "num_tokens": 1540624.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.09378044307231903, "kl": 0.007655891356989741, "learning_rate": 1.596666666666667e-06, "loss": 0.0004, "num_tokens": 1540926.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 96.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.027063585817813873, "kl": 0.0014878429647069424, "learning_rate": 1.5963333333333333e-06, "loss": 0.0001, "num_tokens": 1541238.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 96.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.1158662810921669, "kl": 0.061968524008989334, "learning_rate": 1.596e-06, "loss": 0.0031, "num_tokens": 1541589.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 96.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.020407572388648987, "kl": 0.09662141278386116, "learning_rate": 1.5956666666666666e-06, "loss": 0.0048, "num_tokens": 1541961.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 96.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.030843282118439674, "kl": 0.0003384128212928772, "learning_rate": 1.5953333333333334e-06, "loss": 0.0, "num_tokens": 1542173.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 96.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.029449859634041786, "kl": 0.0008158758282661438, "learning_rate": 1.595e-06, "loss": 0.0, "num_tokens": 1542433.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 96.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04036524146795273, "kl": 0.018667737022042274, "learning_rate": 1.5946666666666668e-06, "loss": 0.0009, "num_tokens": 1542804.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.319491386413574, "kl": 0.020450257696211338, "learning_rate": 1.5943333333333335e-06, "loss": 0.1355, "num_tokens": 1543097.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 96.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.023998115211725235, "kl": 0.0037993593141436577, "learning_rate": 1.5940000000000001e-06, "loss": 0.0002, "num_tokens": 1543429.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.029288535937666893, "kl": 0.0012310373422224075, "learning_rate": 1.593666666666667e-06, "loss": 0.0001, "num_tokens": 1543753.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.0808610916137695, "kl": 0.0095128309330903, "learning_rate": 1.5933333333333333e-06, "loss": 0.0633, "num_tokens": 1544037.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 96.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.05988513305783272, "kl": 0.014316507615149021, "learning_rate": 1.593e-06, "loss": 0.0008, "num_tokens": 1544311.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 96.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.010522710159420967, "kl": 0.0015555593417957425, "learning_rate": 1.5926666666666666e-06, "loss": 0.0001, "num_tokens": 1544579.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 96.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.1397678554058075, "kl": 0.016374513506889343, "learning_rate": 1.5923333333333334e-06, "loss": 0.0008, "num_tokens": 1544901.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 96.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.006504387594759464, "kl": 0.0015593841671943665, "learning_rate": 1.592e-06, "loss": 0.0001, "num_tokens": 1545117.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 96.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05867267772555351, "kl": 0.0030096396803855896, "learning_rate": 1.5916666666666667e-06, "loss": 0.0002, "num_tokens": 1545366.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 96.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.1795063614845276, "kl": 0.00440611457452178, "learning_rate": 1.5913333333333335e-06, "loss": 0.0003, "num_tokens": 1545586.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 96.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.08656342327594757, "kl": 0.009550884831696749, "learning_rate": 1.591e-06, "loss": 0.0005, "num_tokens": 1545913.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 96.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012767196167260408, "kl": 0.00024462938745273277, "learning_rate": 1.5906666666666669e-06, "loss": 0.0, "num_tokens": 1546133.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 96.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.990260601043701, "kl": 0.10891515691764653, "learning_rate": 1.5903333333333332e-06, "loss": 0.1837, "num_tokens": 1546491.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 5230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.027175987139344215, "kl": 0.009174252860248089, "learning_rate": 1.59e-06, "loss": 0.0005, "num_tokens": 1546765.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 96.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0720847100019455, "kl": 0.0019926356617361307, "learning_rate": 1.5896666666666666e-06, "loss": 0.0001, "num_tokens": 1547035.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 96.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.2600727081298828, "kl": 0.04558689740952104, "learning_rate": 1.5893333333333334e-06, "loss": 0.0025, "num_tokens": 1547297.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 96.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0636097639799118, "kl": 0.016690427903085947, "learning_rate": 1.589e-06, "loss": 0.0008, "num_tokens": 1547559.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.10395558178424835, "kl": 0.03940185043029487, "learning_rate": 1.5886666666666667e-06, "loss": 0.002, "num_tokens": 1547849.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 96.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.11616316437721252, "kl": 0.05716337263584137, "learning_rate": 1.5883333333333335e-06, "loss": 0.0028, "num_tokens": 1548253.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 96.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.019060350954532623, "kl": 0.0007412591949105263, "learning_rate": 1.588e-06, "loss": 0.0, "num_tokens": 1548565.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 97.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.015772633254528046, "kl": 0.16136373579502106, "learning_rate": 1.5876666666666669e-06, "loss": 0.0081, "num_tokens": 1548873.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 97.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05440906435251236, "kl": 0.008499347837641835, "learning_rate": 1.5873333333333332e-06, "loss": 0.0004, "num_tokens": 1549161.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.039005979895591736, "kl": 0.0015152791747823358, "learning_rate": 1.5870000000000002e-06, "loss": 0.0001, "num_tokens": 1549431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 97.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.855672836303711, "kl": 0.0875339973717928, "learning_rate": 1.5866666666666666e-06, "loss": -0.0825, "num_tokens": 1549757.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.025368299335241318, "kl": 0.0020077545195817947, "learning_rate": 1.5863333333333334e-06, "loss": 0.0001, "num_tokens": 1550010.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 97.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.024747377261519432, "kl": 0.005036524264141917, "learning_rate": 1.586e-06, "loss": 0.0003, "num_tokens": 1550346.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.32676878571510315, "kl": 0.08721771091222763, "learning_rate": 1.5856666666666667e-06, "loss": 0.0047, "num_tokens": 1550640.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 97.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.2946219444274902, "kl": 0.032050661742687225, "learning_rate": 1.5853333333333335e-06, "loss": -0.0327, "num_tokens": 1551024.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 97.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1397843360900879, "kl": 0.03741133585572243, "learning_rate": 1.585e-06, "loss": 0.0019, "num_tokens": 1551308.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 97.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.7527925968170166, "kl": 0.08169154822826385, "learning_rate": 1.5846666666666669e-06, "loss": -0.0154, "num_tokens": 1551670.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 97.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.2162325382232666, "kl": 0.029286948963999748, "learning_rate": 1.5843333333333332e-06, "loss": 0.0234, "num_tokens": 1551964.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 97.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.15857195854187, "kl": 0.5796295739710331, "learning_rate": 1.5840000000000002e-06, "loss": 0.019, "num_tokens": 1552259.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 97.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08338232338428497, "kl": 0.009691339917480946, "learning_rate": 1.5836666666666666e-06, "loss": 0.0005, "num_tokens": 1552525.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014464963169302791, "kl": 4.537403583526611e-06, "learning_rate": 1.5833333333333333e-06, "loss": 0.0, "num_tokens": 1552745.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 97.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.011426261626183987, "kl": 0.01420259429141879, "learning_rate": 1.583e-06, "loss": 0.0007, "num_tokens": 1553005.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 97.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.811831474304199, "kl": 0.12035173550248146, "learning_rate": 1.5826666666666667e-06, "loss": 0.0109, "num_tokens": 1553334.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 97.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0537535697221756, "kl": 0.009263205574825406, "learning_rate": 1.5823333333333335e-06, "loss": 0.0005, "num_tokens": 1553662.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.012192741967737675, "kl": 0.008494176901876926, "learning_rate": 1.582e-06, "loss": 0.0004, "num_tokens": 1553934.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 97.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.01998489536345005, "kl": 0.0010051537537947297, "learning_rate": 1.5816666666666668e-06, "loss": 0.0001, "num_tokens": 1554230.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 97.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05653964355587959, "kl": 0.012128156144171953, "learning_rate": 1.5813333333333332e-06, "loss": 0.0007, "num_tokens": 1554567.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.030683143064379692, "kl": 0.0006210058927536011, "learning_rate": 1.5810000000000002e-06, "loss": 0.0, "num_tokens": 1554779.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 97.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 1.8385437726974487, "kl": 0.21021639555692673, "learning_rate": 1.5806666666666666e-06, "loss": 0.0116, "num_tokens": 1555119.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 97.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.064893439412117, "kl": 0.0026374012231826782, "learning_rate": 1.5803333333333333e-06, "loss": 0.0001, "num_tokens": 1555331.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 97.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.3022162914276123, "kl": 0.3044265806674957, "learning_rate": 1.58e-06, "loss": -0.0696, "num_tokens": 1555697.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 5261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 97.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.00520613556727767, "kl": 0.0013129889848642051, "learning_rate": 1.5796666666666667e-06, "loss": 0.0001, "num_tokens": 1555957.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 97.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.4643997848033905, "kl": 0.041804001142736524, "learning_rate": 1.5793333333333335e-06, "loss": 0.0024, "num_tokens": 1556279.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 97.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03763243183493614, "kl": 0.015672972425818443, "learning_rate": 1.579e-06, "loss": 0.0007, "num_tokens": 1556631.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 97.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.019132385030388832, "kl": 0.265506386756897, "learning_rate": 1.5786666666666668e-06, "loss": 0.0133, "num_tokens": 1556935.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 97.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09526564180850983, "kl": 0.016265312675386667, "learning_rate": 1.5783333333333334e-06, "loss": 0.0007, "num_tokens": 1557259.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 97.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07023649662733078, "kl": 0.006840124959126115, "learning_rate": 1.5780000000000002e-06, "loss": 0.0004, "num_tokens": 1557568.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 97.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.038880303502082825, "kl": 0.15783193707466125, "learning_rate": 1.5776666666666665e-06, "loss": 0.0079, "num_tokens": 1557878.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 97.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05422493815422058, "kl": 0.0028571193106472492, "learning_rate": 1.5773333333333333e-06, "loss": 0.0001, "num_tokens": 1558121.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 97.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06280789524316788, "kl": 0.009660904761403799, "learning_rate": 1.5769999999999999e-06, "loss": 0.0005, "num_tokens": 1558409.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03092641569674015, "kl": 0.0037699388340115547, "learning_rate": 1.5766666666666667e-06, "loss": 0.0002, "num_tokens": 1558693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 97.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0648389682173729, "kl": 0.10496380552649498, "learning_rate": 1.5763333333333335e-06, "loss": 0.0052, "num_tokens": 1559065.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 97.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.007291814312338829, "kl": 0.00016025701916078106, "learning_rate": 1.576e-06, "loss": 0.0, "num_tokens": 1559335.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008909560274332762, "kl": 0.003739573061466217, "learning_rate": 1.5756666666666668e-06, "loss": 0.0002, "num_tokens": 1559571.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 97.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 5.143528938293457, "kl": 0.029185396801040042, "learning_rate": 1.5753333333333334e-06, "loss": 0.1239, "num_tokens": 1559838.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 97.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.853620171546936, "kl": 0.10425485437735915, "learning_rate": 1.5750000000000002e-06, "loss": 0.0036, "num_tokens": 1560158.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.19300009310245514, "kl": 0.027175567112863064, "learning_rate": 1.5746666666666665e-06, "loss": 0.0014, "num_tokens": 1560441.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 97.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.06380121409893036, "kl": 0.0016522258520126343, "learning_rate": 1.5743333333333333e-06, "loss": 0.0001, "num_tokens": 1560701.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 97.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 10.099404335021973, "kl": 0.03659984492696822, "learning_rate": 1.5739999999999999e-06, "loss": 0.0654, "num_tokens": 1560976.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 97.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.019011495634913445, "kl": 0.0007260367274284363, "learning_rate": 1.5736666666666667e-06, "loss": 0.0, "num_tokens": 1561288.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 97.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.010913548059761524, "kl": 0.00045352215238381177, "learning_rate": 1.5733333333333334e-06, "loss": 0.0, "num_tokens": 1561602.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 97.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.052574340254068375, "kl": 0.015412142500281334, "learning_rate": 1.573e-06, "loss": 0.0008, "num_tokens": 1561904.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0987841784954071, "kl": 0.003970506833866239, "learning_rate": 1.5726666666666668e-06, "loss": 0.0002, "num_tokens": 1562117.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 97.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.21417739987373352, "kl": 0.018260984565131366, "learning_rate": 1.5723333333333334e-06, "loss": 0.0011, "num_tokens": 1562355.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02639508806169033, "kl": 0.0065969196148216724, "learning_rate": 1.5720000000000002e-06, "loss": 0.0003, "num_tokens": 1562647.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 97.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.2457984685897827, "kl": 0.4335959553718567, "learning_rate": 1.5716666666666665e-06, "loss": -0.0131, "num_tokens": 1563047.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 5286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 97.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.001239181961864233, "kl": 0.0012553312699310482, "learning_rate": 1.5713333333333333e-06, "loss": 0.0001, "num_tokens": 1563327.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 97.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 5.417866230010986, "kl": 0.11619619559496641, "learning_rate": 1.5709999999999999e-06, "loss": 0.1439, "num_tokens": 1563672.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 5288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 97.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.01846601627767086, "kl": 0.002122808597050607, "learning_rate": 1.5706666666666666e-06, "loss": 0.0001, "num_tokens": 1563940.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.25, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 97.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.5261380672454834, "kl": 0.006361888954415917, "learning_rate": 1.5703333333333334e-06, "loss": 0.4192, "num_tokens": 1564365.0, "reward": 7.425000190734863, "reward_std": 0.15000009536743164, "rewards/reward_combined/mean": 7.425000190734863, "rewards/reward_combined/std": 0.15000009536743164, "step": 5290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.052766088396310806, "kl": 0.0031872778199613094, "learning_rate": 1.57e-06, "loss": 0.0001, "num_tokens": 1564582.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.014063029550015926, "kl": 0.00013606548600364476, "learning_rate": 1.5696666666666668e-06, "loss": 0.0, "num_tokens": 1564838.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.024823691695928574, "kl": 0.00239275477360934, "learning_rate": 1.5693333333333334e-06, "loss": 0.0001, "num_tokens": 1565136.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 98.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03821438178420067, "kl": 0.03898667357861996, "learning_rate": 1.5690000000000001e-06, "loss": 0.0019, "num_tokens": 1565540.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 98.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04019486531615257, "kl": 0.005967809120193124, "learning_rate": 1.5686666666666665e-06, "loss": 0.0003, "num_tokens": 1565867.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 98.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.191434383392334, "kl": 0.09390311315655708, "learning_rate": 1.5683333333333335e-06, "loss": -0.0424, "num_tokens": 1566195.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 5296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06939195841550827, "kl": 0.0035546133294701576, "learning_rate": 1.5679999999999999e-06, "loss": 0.0002, "num_tokens": 1566459.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 98.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.037322599440813065, "kl": 0.09946007654070854, "learning_rate": 1.5676666666666666e-06, "loss": 0.005, "num_tokens": 1566831.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.016945242881774902, "kl": 0.005102618131786585, "learning_rate": 1.5673333333333334e-06, "loss": 0.0003, "num_tokens": 1567099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 98.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02787545695900917, "kl": 0.0010244142613373697, "learning_rate": 1.567e-06, "loss": 0.0001, "num_tokens": 1567423.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.1003614068031311, "kl": 0.009341378579847515, "learning_rate": 1.5666666666666668e-06, "loss": 0.0005, "num_tokens": 1567725.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 98.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.02863270603120327, "kl": 0.004132440779358149, "learning_rate": 1.5663333333333333e-06, "loss": 0.0002, "num_tokens": 1568050.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 98.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.11757971346378326, "kl": 0.016049266327172518, "learning_rate": 1.5660000000000001e-06, "loss": 0.0008, "num_tokens": 1568341.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 98.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07015055418014526, "kl": 0.0019031152478419244, "learning_rate": 1.5656666666666665e-06, "loss": 0.0001, "num_tokens": 1568574.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.013869311660528183, "kl": 0.0009351014741696417, "learning_rate": 1.5653333333333335e-06, "loss": 0.0, "num_tokens": 1568834.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 98.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.18689417839050293, "kl": 0.026396608911454678, "learning_rate": 1.5649999999999998e-06, "loss": 0.0012, "num_tokens": 1569172.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 98.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03671710565686226, "kl": 0.004010175005532801, "learning_rate": 1.5646666666666666e-06, "loss": 0.0002, "num_tokens": 1569461.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 98.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615633949637413, "kl": 0.004450254142284393, "learning_rate": 1.5643333333333334e-06, "loss": 0.0002, "num_tokens": 1569677.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.1403944343328476, "kl": 0.008808583690552041, "learning_rate": 1.564e-06, "loss": 0.0004, "num_tokens": 1569951.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 98.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014764757361263037, "kl": 4.641711711883545e-06, "learning_rate": 1.5636666666666668e-06, "loss": 0.0, "num_tokens": 1570171.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 98.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1486303061246872, "kl": 0.02301202341914177, "learning_rate": 1.5633333333333333e-06, "loss": 0.0012, "num_tokens": 1570482.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 98.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05841120705008507, "kl": 0.02735324203968048, "learning_rate": 1.5630000000000001e-06, "loss": 0.0014, "num_tokens": 1570874.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 98.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.23246394097805023, "kl": 0.04803896322846413, "learning_rate": 1.5626666666666665e-06, "loss": 0.0023, "num_tokens": 1571188.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 98.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.008211187086999416, "kl": 0.0007003595528658479, "learning_rate": 1.5623333333333335e-06, "loss": 0.0, "num_tokens": 1571408.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 7.275888919830322, "kl": 0.02710882108658552, "learning_rate": 1.5620000000000002e-06, "loss": 0.2353, "num_tokens": 1571703.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 98.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.344841003417969, "kl": 0.6561076119542122, "learning_rate": 1.5616666666666666e-06, "loss": -0.2573, "num_tokens": 1571999.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 5316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 71.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 71.5, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.028782606124878, "kl": 0.008606459014117718, "learning_rate": 1.5613333333333336e-06, "loss": 0.4631, "num_tokens": 1572505.0, "reward": 7.300000190734863, "reward_std": 0.40000009536743164, "rewards/reward_combined/mean": 7.300000190734863, "rewards/reward_combined/std": 0.40000009536743164, "step": 5317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 98.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.8740912079811096, "kl": 0.11713118478655815, "learning_rate": 1.561e-06, "loss": 0.0064, "num_tokens": 1572838.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 98.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.006007605232298374, "kl": 0.0003809332847595215, "learning_rate": 1.5606666666666667e-06, "loss": 0.0, "num_tokens": 1573098.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.012878204695880413, "kl": 0.0017801049398258328, "learning_rate": 1.5603333333333333e-06, "loss": 0.0001, "num_tokens": 1573375.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 98.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.023811189457774162, "kl": 0.2646760046482086, "learning_rate": 1.56e-06, "loss": 0.0132, "num_tokens": 1573679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 98.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.006480825133621693, "kl": 0.00012418627738952637, "learning_rate": 1.5596666666666667e-06, "loss": 0.0, "num_tokens": 1573891.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 98.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.5577068328857422, "kl": 0.049407415091991425, "learning_rate": 1.5593333333333335e-06, "loss": 0.0597, "num_tokens": 1574239.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.009676500223577023, "kl": 0.0005857936921529472, "learning_rate": 1.5590000000000002e-06, "loss": 0.0, "num_tokens": 1574501.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 98.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.012871612794697285, "kl": 0.0005100475536892191, "learning_rate": 1.5586666666666666e-06, "loss": 0.0, "num_tokens": 1574810.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.6796942353248596, "kl": 0.0760381855070591, "learning_rate": 1.5583333333333336e-06, "loss": 0.0036, "num_tokens": 1575090.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.058911386877298355, "kl": 0.0014866202618577518, "learning_rate": 1.558e-06, "loss": 0.0001, "num_tokens": 1575346.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 98.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008437736541964114, "kl": 0.003752976655960083, "learning_rate": 1.5576666666666667e-06, "loss": 0.0002, "num_tokens": 1575582.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 98.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.002823119517415762, "kl": 0.00024881362332962453, "learning_rate": 1.5573333333333333e-06, "loss": 0.0, "num_tokens": 1575802.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 98.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 4.0280938148498535, "kl": 0.22838620003312826, "learning_rate": 1.557e-06, "loss": 0.0386, "num_tokens": 1576111.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 98.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04869832843542099, "kl": 0.020665702410042286, "learning_rate": 1.5566666666666667e-06, "loss": 0.0009, "num_tokens": 1576464.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 98.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.08788575232028961, "kl": 0.0032032057642936707, "learning_rate": 1.5563333333333334e-06, "loss": 0.0002, "num_tokens": 1576674.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016967543633654714, "kl": 0.0001244927480001934, "learning_rate": 1.5560000000000002e-06, "loss": 0.0, "num_tokens": 1576944.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 98.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.604962348937988, "kl": 0.016524864826351404, "learning_rate": 1.5556666666666666e-06, "loss": 0.0472, "num_tokens": 1577206.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 98.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.3547906279563904, "kl": 0.05260276701301336, "learning_rate": 1.5553333333333336e-06, "loss": 0.0031, "num_tokens": 1577487.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 98.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.6202030181884766, "kl": 0.04861114360392094, "learning_rate": 1.555e-06, "loss": 0.062, "num_tokens": 1577849.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 5336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 98.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.3343932330608368, "kl": 0.022482444532215595, "learning_rate": 1.5546666666666667e-06, "loss": 0.001, "num_tokens": 1578113.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 98.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05401170626282692, "kl": 0.0022344777826219797, "learning_rate": 1.5543333333333333e-06, "loss": 0.0001, "num_tokens": 1578362.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.022278184071183205, "kl": 0.003950534504838288, "learning_rate": 1.554e-06, "loss": 0.0002, "num_tokens": 1578654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07880939543247223, "kl": 0.057072628289461136, "learning_rate": 1.5536666666666666e-06, "loss": 0.0029, "num_tokens": 1578945.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 98.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.069204092025757, "kl": 0.09081784635782242, "learning_rate": 1.5533333333333334e-06, "loss": -0.0611, "num_tokens": 1579317.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 5341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 98.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01980489492416382, "kl": 0.002585027366876602, "learning_rate": 1.5530000000000002e-06, "loss": 0.0001, "num_tokens": 1579629.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.031025422737002373, "kl": 0.003615888301283121, "learning_rate": 1.5526666666666668e-06, "loss": 0.0002, "num_tokens": 1579913.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 98.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02114158309996128, "kl": 0.004775496083311737, "learning_rate": 1.5523333333333336e-06, "loss": 0.0002, "num_tokens": 1580247.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07181570678949356, "kl": 0.01126834750175476, "learning_rate": 1.552e-06, "loss": 0.0006, "num_tokens": 1580521.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 99.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06870504468679428, "kl": 0.010243732016533613, "learning_rate": 1.5516666666666667e-06, "loss": 0.0005, "num_tokens": 1580848.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 99.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.24986718595027924, "kl": 0.011883744155056775, "learning_rate": 1.5513333333333333e-06, "loss": 0.0006, "num_tokens": 1581119.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 99.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.053214896470308304, "kl": 0.011505658272653818, "learning_rate": 1.551e-06, "loss": 0.0006, "num_tokens": 1581413.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 99.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.103551149368286, "kl": 0.1987457387149334, "learning_rate": 1.5506666666666666e-06, "loss": 0.1555, "num_tokens": 1581762.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 99.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.17072442173957825, "kl": 0.021500190254300833, "learning_rate": 1.5503333333333334e-06, "loss": 0.0011, "num_tokens": 1582096.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 99.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.09728353470563889, "kl": 0.027855553664267063, "learning_rate": 1.5500000000000002e-06, "loss": 0.0014, "num_tokens": 1582367.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 99.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.540256977081299, "kl": 0.20179709047079086, "learning_rate": 1.5496666666666668e-06, "loss": -0.0031, "num_tokens": 1582670.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 99.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05950898304581642, "kl": 0.007303935009986162, "learning_rate": 1.5493333333333335e-06, "loss": 0.0004, "num_tokens": 1583003.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 99.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04495961219072342, "kl": 0.0029592177888844162, "learning_rate": 1.549e-06, "loss": 0.0001, "num_tokens": 1583271.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 99.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.015279130078852177, "kl": 0.0010715940152294934, "learning_rate": 1.5486666666666667e-06, "loss": 0.0001, "num_tokens": 1583539.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 99.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.06639955937862396, "kl": 0.017613645642995834, "learning_rate": 1.5483333333333333e-06, "loss": 0.0011, "num_tokens": 1583898.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 99.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.02236146852374077, "kl": 0.001061448361724615, "learning_rate": 1.548e-06, "loss": 0.0001, "num_tokens": 1584167.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 99.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04704895615577698, "kl": 0.0005344539822544903, "learning_rate": 1.5476666666666666e-06, "loss": 0.0, "num_tokens": 1584381.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 99.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.05783044919371605, "kl": 0.001563534140586853, "learning_rate": 1.5473333333333334e-06, "loss": 0.0001, "num_tokens": 1584591.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 99.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.09060274809598923, "kl": 0.009187803603708744, "learning_rate": 1.5470000000000002e-06, "loss": 0.0005, "num_tokens": 1584923.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 99.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.18047532439231873, "kl": 0.03346627578139305, "learning_rate": 1.5466666666666668e-06, "loss": 0.0017, "num_tokens": 1585221.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 99.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.15399408340454102, "kl": 0.03782643564045429, "learning_rate": 1.5463333333333335e-06, "loss": 0.0019, "num_tokens": 1585544.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 99.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.3413825035095215, "kl": 0.05090123228728771, "learning_rate": 1.5459999999999999e-06, "loss": 0.0996, "num_tokens": 1585894.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 99.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.025552954524755478, "kl": 0.002415069960989058, "learning_rate": 1.5456666666666669e-06, "loss": 0.0001, "num_tokens": 1586154.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 99.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04145500436425209, "kl": 0.005083933472633362, "learning_rate": 1.5453333333333332e-06, "loss": 0.0003, "num_tokens": 1586370.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 99.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.010239742696285248, "kl": 0.00029083655681461096, "learning_rate": 1.545e-06, "loss": 0.0, "num_tokens": 1586686.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 99.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05068647861480713, "kl": 0.004774346947669983, "learning_rate": 1.5446666666666666e-06, "loss": 0.0002, "num_tokens": 1586990.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 99.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018730225274339318, "kl": 0.0003124594804830849, "learning_rate": 1.5443333333333334e-06, "loss": 0.0, "num_tokens": 1587210.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 99.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011681350588332862, "kl": 4.000961780548096e-06, "learning_rate": 1.5440000000000002e-06, "loss": 0.0, "num_tokens": 1587430.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 99.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.01794993132352829, "kl": 0.0023926477879285812, "learning_rate": 1.5436666666666667e-06, "loss": 0.0001, "num_tokens": 1587742.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 99.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02146325074136257, "kl": 0.0062999005895107985, "learning_rate": 1.5433333333333335e-06, "loss": 0.0003, "num_tokens": 1588010.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 99.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08506253361701965, "kl": 0.01793564297258854, "learning_rate": 1.5429999999999999e-06, "loss": 0.001, "num_tokens": 1588292.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 99.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.055309124290943146, "kl": 0.002217214263509959, "learning_rate": 1.5426666666666669e-06, "loss": 0.0001, "num_tokens": 1588611.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 99.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05325845628976822, "kl": 0.002905784174799919, "learning_rate": 1.5423333333333332e-06, "loss": 0.0001, "num_tokens": 1588860.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 99.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.029673736542463303, "kl": 0.001128291798522696, "learning_rate": 1.542e-06, "loss": 0.0001, "num_tokens": 1589093.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 99.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.022981755435466766, "kl": 0.09596388041973114, "learning_rate": 1.5416666666666666e-06, "loss": 0.0048, "num_tokens": 1589465.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 99.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.841578960418701, "kl": 0.24588903784751892, "learning_rate": 1.5413333333333334e-06, "loss": 0.094, "num_tokens": 1589781.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 99.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.19184766709804535, "kl": 0.015550390351563692, "learning_rate": 1.5410000000000002e-06, "loss": 0.0009, "num_tokens": 1590083.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 99.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.066701889038086, "kl": 0.007523627951741219, "learning_rate": 1.5406666666666667e-06, "loss": 0.033, "num_tokens": 1590420.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5379 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 99.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.120488405227661, "kl": 0.07349224388599396, "learning_rate": 1.5403333333333335e-06, "loss": 0.0053, "num_tokens": 1590800.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 99.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007782155880704522, "kl": 0.003766007721424103, "learning_rate": 1.5399999999999999e-06, "loss": 0.0002, "num_tokens": 1591036.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 99.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 5.380682945251465, "kl": 0.12924740463495255, "learning_rate": 1.5396666666666669e-06, "loss": 0.1347, "num_tokens": 1591392.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 5382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 99.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03189317137002945, "kl": 0.001747717848047614, "learning_rate": 1.5393333333333332e-06, "loss": 0.0001, "num_tokens": 1591695.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 99.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.045159101486206055, "kl": 0.007392449617327657, "learning_rate": 1.539e-06, "loss": 0.0004, "num_tokens": 1591965.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 99.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.013723954558372498, "kl": 0.013721433002501726, "learning_rate": 1.5386666666666666e-06, "loss": 0.0007, "num_tokens": 1592225.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 99.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.15103651583194733, "kl": 0.03804316185414791, "learning_rate": 1.5383333333333334e-06, "loss": 0.0019, "num_tokens": 1592536.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 99.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.16732698678970337, "kl": 0.017702241544611752, "learning_rate": 1.5380000000000001e-06, "loss": 0.0009, "num_tokens": 1592826.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 94.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 94.25, "completions/mean_terminated_length": 40.333335876464844, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 99.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.7212657928466797, "kl": 0.030908092856407166, "learning_rate": 1.5376666666666667e-06, "loss": 0.3665, "num_tokens": 1593427.0, "reward": 5.425000190734863, "reward_std": 4.149999618530273, "rewards/reward_combined/mean": 5.425000190734863, "rewards/reward_combined/std": 4.150000095367432, "step": 5388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 99.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.09785301238298416, "kl": 0.013820950407534838, "learning_rate": 1.5373333333333335e-06, "loss": 0.0007, "num_tokens": 1593719.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 99.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 2.9454638957977295, "kl": 0.44942344445735216, "learning_rate": 1.537e-06, "loss": 0.0232, "num_tokens": 1594008.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 99.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0752691999077797, "kl": 0.07165481522679329, "learning_rate": 1.5366666666666668e-06, "loss": 0.0036, "num_tokens": 1594302.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 99.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 8.373311042785645, "kl": 0.015349007211625576, "learning_rate": 1.5363333333333332e-06, "loss": 0.1194, "num_tokens": 1594631.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 99.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02674560621380806, "kl": 0.0014183521270751953, "learning_rate": 1.536e-06, "loss": 0.0001, "num_tokens": 1594843.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 99.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.023457389324903488, "kl": 0.0003246545675210655, "learning_rate": 1.5356666666666666e-06, "loss": 0.0, "num_tokens": 1595099.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 99.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.5527740120887756, "kl": 0.0588977187871933, "learning_rate": 1.5353333333333333e-06, "loss": 0.0036, "num_tokens": 1595391.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 72.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 72.0, "completions/mean_terminated_length": 10.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 99.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.405885100364685, "kl": 0.015394964255392551, "learning_rate": 1.5350000000000001e-06, "loss": 0.4571, "num_tokens": 1595899.0, "reward": 6.300000190734863, "reward_std": 2.4000000953674316, "rewards/reward_combined/mean": 6.300000190734863, "rewards/reward_combined/std": 2.3999998569488525, "step": 5396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 99.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008264186908490956, "kl": 0.0011939768446609378, "learning_rate": 1.5346666666666667e-06, "loss": 0.0001, "num_tokens": 1596179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 99.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.005415427964180708, "kl": 0.00031583383679389954, "learning_rate": 1.5343333333333335e-06, "loss": 0.0, "num_tokens": 1596439.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 99.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.036006297916173935, "kl": 0.031755766831338406, "learning_rate": 1.534e-06, "loss": 0.0015, "num_tokens": 1596855.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 100.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08657006919384003, "kl": 0.007303065387532115, "learning_rate": 1.5336666666666668e-06, "loss": 0.0003, "num_tokens": 1597115.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 100.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.492645740509033, "kl": 0.27295850962400436, "learning_rate": 1.5333333333333332e-06, "loss": 0.0208, "num_tokens": 1597423.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 100.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.037336818873882294, "kl": 0.09904588013887405, "learning_rate": 1.533e-06, "loss": 0.005, "num_tokens": 1597795.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 100.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03178287670016289, "kl": 0.00783985760062933, "learning_rate": 1.5326666666666665e-06, "loss": 0.0004, "num_tokens": 1598118.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 100.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.08238870650529861, "kl": 0.016373123042285442, "learning_rate": 1.5323333333333333e-06, "loss": 0.0008, "num_tokens": 1598411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 100.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07624446600675583, "kl": 0.0022464243702415843, "learning_rate": 1.5320000000000001e-06, "loss": 0.0001, "num_tokens": 1598668.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 100.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06564126163721085, "kl": 0.006683208514004946, "learning_rate": 1.5316666666666667e-06, "loss": 0.0003, "num_tokens": 1598995.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 100.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08014131337404251, "kl": 0.017301190178841352, "learning_rate": 1.5313333333333335e-06, "loss": 0.0009, "num_tokens": 1599301.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 100.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.19261673092842102, "kl": 0.0203330940566957, "learning_rate": 1.531e-06, "loss": 0.0011, "num_tokens": 1599591.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 100.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.031339049339294434, "kl": 0.0016976014303509146, "learning_rate": 1.5306666666666668e-06, "loss": 0.0001, "num_tokens": 1599810.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 100.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.382539987564087, "kl": 0.04858838557265699, "learning_rate": 1.5303333333333332e-06, "loss": 0.1232, "num_tokens": 1600154.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 100.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.7690751552581787, "kl": 0.10870247334241867, "learning_rate": 1.53e-06, "loss": 0.0103, "num_tokens": 1600536.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 100.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.00813469011336565, "kl": 0.0014763634535484016, "learning_rate": 1.5296666666666665e-06, "loss": 0.0001, "num_tokens": 1600810.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 100.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.38398101925849915, "kl": 0.03935919562354684, "learning_rate": 1.5293333333333333e-06, "loss": 0.0022, "num_tokens": 1601113.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 100.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.05523664131760597, "kl": 0.0021810964099131525, "learning_rate": 1.529e-06, "loss": 0.0001, "num_tokens": 1601379.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 100.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.057254012674093246, "kl": 0.01424642140045762, "learning_rate": 1.5286666666666667e-06, "loss": 0.0008, "num_tokens": 1601653.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 100.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.035123929381370544, "kl": 0.0009035170078277588, "learning_rate": 1.5283333333333335e-06, "loss": 0.0, "num_tokens": 1601861.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 100.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0074735041707754135, "kl": 0.0020648986101150513, "learning_rate": 1.528e-06, "loss": 0.0001, "num_tokens": 1602077.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 100.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.010329063981771469, "kl": 0.008671910502016544, "learning_rate": 1.5276666666666668e-06, "loss": 0.0004, "num_tokens": 1602349.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 100.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005601013079285622, "kl": 0.26794689893722534, "learning_rate": 1.5273333333333332e-06, "loss": 0.0134, "num_tokens": 1602653.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 100.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03041202202439308, "kl": 0.0010525789693929255, "learning_rate": 1.5270000000000002e-06, "loss": 0.0001, "num_tokens": 1602970.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 100.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03022598661482334, "kl": 0.003677847096696496, "learning_rate": 1.5266666666666665e-06, "loss": 0.0002, "num_tokens": 1603254.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 100.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005837862379848957, "kl": 0.00020234286785125732, "learning_rate": 1.5263333333333333e-06, "loss": 0.0, "num_tokens": 1603466.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 100.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.27575093507766724, "kl": 0.021130628883838654, "learning_rate": 1.526e-06, "loss": 0.001, "num_tokens": 1603801.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 100.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.011377870105206966, "kl": 0.0034642955870367587, "learning_rate": 1.5256666666666667e-06, "loss": 0.0001, "num_tokens": 1604099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 100.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00850125215947628, "kl": 0.00019849191448884085, "learning_rate": 1.5253333333333334e-06, "loss": 0.0, "num_tokens": 1604367.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 100.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004798910580575466, "kl": 0.0003161365748383105, "learning_rate": 1.525e-06, "loss": 0.0, "num_tokens": 1604681.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 100.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05713745951652527, "kl": 0.019365067593753338, "learning_rate": 1.5246666666666668e-06, "loss": 0.001, "num_tokens": 1605039.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 100.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.030812595039606094, "kl": 0.05632269196212292, "learning_rate": 1.5243333333333332e-06, "loss": 0.0028, "num_tokens": 1605332.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 100.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022928824182599783, "kl": 0.0015261415392160416, "learning_rate": 1.5240000000000001e-06, "loss": 0.0001, "num_tokens": 1605644.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 100.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.002207911340519786, "kl": 6.789564940845594e-05, "learning_rate": 1.5236666666666665e-06, "loss": 0.0, "num_tokens": 1605904.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 100.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07424936443567276, "kl": 0.009732466656714678, "learning_rate": 1.5233333333333333e-06, "loss": 0.0005, "num_tokens": 1606195.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 100.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05747784674167633, "kl": 0.008547557983547449, "learning_rate": 1.523e-06, "loss": 0.0004, "num_tokens": 1606479.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 100.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.031948383897542953, "kl": 0.025077415630221367, "learning_rate": 1.5226666666666666e-06, "loss": 0.001, "num_tokens": 1606804.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 100.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02988605387508869, "kl": 0.0009293212206102908, "learning_rate": 1.5223333333333334e-06, "loss": 0.0, "num_tokens": 1607074.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 100.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.043994903564453, "kl": 0.031462740153074265, "learning_rate": 1.522e-06, "loss": 0.1026, "num_tokens": 1607408.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 100.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.007574048358947039, "kl": 0.0003867149353027344, "learning_rate": 1.5216666666666668e-06, "loss": 0.0, "num_tokens": 1607644.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 100.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.14587850868701935, "kl": 0.041430942714214325, "learning_rate": 1.5213333333333331e-06, "loss": 0.002, "num_tokens": 1607988.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 100.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0616946816444397, "kl": 0.0038246663461904973, "learning_rate": 1.5210000000000001e-06, "loss": 0.0002, "num_tokens": 1608254.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 100.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014803546480834484, "kl": 0.0016542524099349976, "learning_rate": 1.520666666666667e-06, "loss": 0.0001, "num_tokens": 1608514.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 100.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.19577500224113464, "kl": 0.006077451631426811, "learning_rate": 1.5203333333333333e-06, "loss": 0.0003, "num_tokens": 1608788.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 100.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.012572052888572216, "kl": 0.00023206590049085207, "learning_rate": 1.5200000000000003e-06, "loss": 0.0, "num_tokens": 1609030.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 100.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 6.121943950653076, "kl": 0.029760083183646202, "learning_rate": 1.5196666666666666e-06, "loss": 0.1309, "num_tokens": 1609351.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 100.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007620318792760372, "kl": 0.0037682130932807922, "learning_rate": 1.5193333333333334e-06, "loss": 0.0002, "num_tokens": 1609587.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 100.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.010014322586357594, "kl": 0.0401719119399786, "learning_rate": 1.519e-06, "loss": 0.002, "num_tokens": 1609992.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 100.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.027753768488764763, "kl": 0.0012380480766296387, "learning_rate": 1.5186666666666668e-06, "loss": 0.0001, "num_tokens": 1610208.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 100.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003488502698019147, "kl": 0.00019267946481704712, "learning_rate": 1.5183333333333333e-06, "loss": 0.0, "num_tokens": 1610428.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 100.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.728623628616333, "kl": 0.017682242207229137, "learning_rate": 1.5180000000000001e-06, "loss": 0.0281, "num_tokens": 1610689.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 100.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.021002013236284256, "kl": 0.0006055720150470734, "learning_rate": 1.517666666666667e-06, "loss": 0.0, "num_tokens": 1610949.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 100.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09916306287050247, "kl": 0.01735564274713397, "learning_rate": 1.5173333333333333e-06, "loss": 0.0009, "num_tokens": 1611282.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 100.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.029490530490875244, "kl": 0.0021016259561292827, "learning_rate": 1.5170000000000003e-06, "loss": 0.0001, "num_tokens": 1611578.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006172839552164078, "clip_ratio/low_min": 0.006172839552164078, "clip_ratio/region_mean": 0.006172839552164078, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 100.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.9005918502807617, "kl": 0.06473296135663986, "learning_rate": 1.5166666666666666e-06, "loss": 0.2028, "num_tokens": 1611943.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 5451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 100.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.7278189659118652, "kl": 0.07189453579485416, "learning_rate": 1.5163333333333334e-06, "loss": 0.0153, "num_tokens": 1612274.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 100.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.20069493353366852, "kl": 0.03770335204899311, "learning_rate": 1.516e-06, "loss": 0.0019, "num_tokens": 1612546.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.8044025897979736, "kl": 0.10362424701452255, "learning_rate": 1.5156666666666668e-06, "loss": 0.0066, "num_tokens": 1612826.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 101.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.19437113404273987, "kl": 0.030902760103344917, "learning_rate": 1.5153333333333333e-06, "loss": 0.0016, "num_tokens": 1613151.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020378935150802135, "kl": 3.0475853236566763e-05, "learning_rate": 1.5150000000000001e-06, "loss": 0.0, "num_tokens": 1613411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 101.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.0847392082214355, "kl": 0.07134226709604263, "learning_rate": 1.5146666666666669e-06, "loss": 0.1019, "num_tokens": 1613755.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.051703307777643204, "kl": 0.014781441539525986, "learning_rate": 1.5143333333333332e-06, "loss": 0.0008, "num_tokens": 1614037.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 101.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.2231585830450058, "kl": 0.019413352943956852, "learning_rate": 1.5140000000000002e-06, "loss": 0.001, "num_tokens": 1614393.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 101.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06328113377094269, "kl": 0.005902788136154413, "learning_rate": 1.5136666666666666e-06, "loss": 0.0003, "num_tokens": 1614697.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 101.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.025488771498203278, "kl": 0.0013407915830612183, "learning_rate": 1.5133333333333334e-06, "loss": 0.0001, "num_tokens": 1614909.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 101.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.028874771669507027, "kl": 0.00027207285165786743, "learning_rate": 1.513e-06, "loss": 0.0, "num_tokens": 1615121.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.057352304458618164, "kl": 0.02362719837401528, "learning_rate": 1.5126666666666667e-06, "loss": 0.0012, "num_tokens": 1615408.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 101.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.1560592651367188, "kl": 0.024163642898201942, "learning_rate": 1.5123333333333333e-06, "loss": 0.0745, "num_tokens": 1615782.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 101.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.021252160891890526, "kl": 0.006311272969469428, "learning_rate": 1.512e-06, "loss": 0.0003, "num_tokens": 1616050.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 101.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.025522353127598763, "kl": 0.0011507653980515897, "learning_rate": 1.5116666666666669e-06, "loss": 0.0001, "num_tokens": 1616362.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 101.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.016603050753474236, "kl": 0.0029182470170781016, "learning_rate": 1.5113333333333334e-06, "loss": 0.0001, "num_tokens": 1616690.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 101.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07513971626758575, "kl": 0.006240957882255316, "learning_rate": 1.5110000000000002e-06, "loss": 0.0003, "num_tokens": 1616964.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 101.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.013259727507829666, "kl": 0.0006940088205737993, "learning_rate": 1.5106666666666666e-06, "loss": 0.0, "num_tokens": 1617199.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.14239011704921722, "kl": 0.04465408995747566, "learning_rate": 1.5103333333333334e-06, "loss": 0.0022, "num_tokens": 1617532.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 101.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035039009526371956, "kl": 0.04595787823200226, "learning_rate": 1.51e-06, "loss": 0.0023, "num_tokens": 1617936.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 101.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.07039260864257812, "kl": 0.015040764585137367, "learning_rate": 1.5096666666666667e-06, "loss": 0.0008, "num_tokens": 1618258.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 101.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 2.7471505745779723e-05, "kl": 3.255903720855713e-06, "learning_rate": 1.5093333333333333e-06, "loss": 0.0, "num_tokens": 1618478.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 101.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00926927849650383, "kl": 0.0001908615231513977, "learning_rate": 1.509e-06, "loss": 0.0, "num_tokens": 1618686.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.3001217246055603, "kl": 0.041287238942459226, "learning_rate": 1.5086666666666669e-06, "loss": 0.0023, "num_tokens": 1618975.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 101.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.18405325710773468, "kl": 0.024043050594627857, "learning_rate": 1.5083333333333334e-06, "loss": 0.0015, "num_tokens": 1619263.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 101.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.034434910863637924, "kl": 0.09782170876860619, "learning_rate": 1.5080000000000002e-06, "loss": 0.0049, "num_tokens": 1619635.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.050050657242536545, "kl": 0.0026265646229148842, "learning_rate": 1.5076666666666666e-06, "loss": 0.0001, "num_tokens": 1619909.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 101.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.890697002410889, "kl": 0.012026506941765547, "learning_rate": 1.5073333333333334e-06, "loss": -0.0367, "num_tokens": 1620203.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 101.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.481655597686768, "kl": 0.03292984934523702, "learning_rate": 1.507e-06, "loss": 0.0584, "num_tokens": 1620512.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 5480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.009642007760703564, "kl": 0.00012345909635769203, "learning_rate": 1.5066666666666667e-06, "loss": 0.0, "num_tokens": 1620768.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 101.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0526387020945549, "kl": 0.0031842728203628212, "learning_rate": 1.5063333333333333e-06, "loss": 0.0002, "num_tokens": 1621066.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 101.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008404916152358055, "kl": 0.0022246912121772766, "learning_rate": 1.506e-06, "loss": 0.0001, "num_tokens": 1621282.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 101.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.026012161746621132, "kl": 0.16249799728393555, "learning_rate": 1.5056666666666668e-06, "loss": 0.0081, "num_tokens": 1621591.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.025419604033231735, "kl": 0.001505140564404428, "learning_rate": 1.5053333333333334e-06, "loss": 0.0001, "num_tokens": 1621862.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 101.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008624795591458678, "kl": 0.0037489011883735657, "learning_rate": 1.5050000000000002e-06, "loss": 0.0002, "num_tokens": 1622098.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 101.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 2.526792049407959, "kl": 0.44115081103518605, "learning_rate": 1.5046666666666666e-06, "loss": 0.0232, "num_tokens": 1622359.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 101.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009270765585824847, "kl": 0.0011186195188201964, "learning_rate": 1.5043333333333333e-06, "loss": 0.0001, "num_tokens": 1622639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 101.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.5725700855255127, "kl": 0.11916254088282585, "learning_rate": 1.504e-06, "loss": -0.0777, "num_tokens": 1623005.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 5489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 101.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.934995412826538, "kl": 0.03453459311276674, "learning_rate": 1.5036666666666667e-06, "loss": 0.0188, "num_tokens": 1623297.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 101.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07513511925935745, "kl": 0.0034808366326615214, "learning_rate": 1.5033333333333333e-06, "loss": 0.0002, "num_tokens": 1623563.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 101.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.11513255536556244, "kl": 0.03979503735899925, "learning_rate": 1.503e-06, "loss": 0.002, "num_tokens": 1623866.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 101.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 5.0089430809021, "kl": 0.012514011934399605, "learning_rate": 1.5026666666666668e-06, "loss": 0.1547, "num_tokens": 1624133.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 101.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.9709498286247253, "kl": 0.3347504287958145, "learning_rate": 1.5023333333333334e-06, "loss": 0.0168, "num_tokens": 1624438.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.037702690809965134, "kl": 0.0046277036890387535, "learning_rate": 1.5020000000000002e-06, "loss": 0.0002, "num_tokens": 1624734.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.13751274347305298, "kl": 0.012181914178654552, "learning_rate": 1.5016666666666665e-06, "loss": 0.0008, "num_tokens": 1625010.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.031027279794216156, "kl": 0.0036498650442808867, "learning_rate": 1.5013333333333335e-06, "loss": 0.0002, "num_tokens": 1625294.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 101.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0700439065694809, "kl": 0.009463974740356207, "learning_rate": 1.501e-06, "loss": 0.0005, "num_tokens": 1625560.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 101.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.2186272144317627, "kl": 0.042012871243059635, "learning_rate": 1.5006666666666667e-06, "loss": 0.027, "num_tokens": 1625892.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 101.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022304037120193243, "kl": 0.0001068115234375, "learning_rate": 1.5003333333333333e-06, "loss": 0.0, "num_tokens": 1626136.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 101.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02951614186167717, "kl": 0.004512539831921458, "learning_rate": 1.5e-06, "loss": 0.0002, "num_tokens": 1626424.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 101.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.035995204001665115, "kl": 0.003464370034635067, "learning_rate": 1.4996666666666666e-06, "loss": 0.0002, "num_tokens": 1626736.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 101.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01149026583880186, "kl": 0.0004345825727796182, "learning_rate": 1.4993333333333334e-06, "loss": 0.0, "num_tokens": 1627054.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 101.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.236963987350464, "kl": 0.07776482030749321, "learning_rate": 1.499e-06, "loss": -0.0867, "num_tokens": 1627423.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 101.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.055479470640420914, "kl": 0.011052647139877081, "learning_rate": 1.4986666666666665e-06, "loss": 0.0005, "num_tokens": 1627748.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 101.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.021672671660780907, "kl": 0.0016036659362725914, "learning_rate": 1.4983333333333335e-06, "loss": 0.0001, "num_tokens": 1627966.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 101.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 7.020152568817139, "kl": 0.04813184216618538, "learning_rate": 1.498e-06, "loss": -0.3337, "num_tokens": 1628278.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 102.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.006351388059556484, "kl": 0.00030978521681390703, "learning_rate": 1.4976666666666667e-06, "loss": 0.0, "num_tokens": 1628550.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 102.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04865812882781029, "kl": 0.03741905279457569, "learning_rate": 1.4973333333333335e-06, "loss": 0.0019, "num_tokens": 1628822.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 102.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.022691408172249794, "kl": 0.017181613482534885, "learning_rate": 1.497e-06, "loss": 0.0009, "num_tokens": 1629178.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 102.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03642177954316139, "kl": 0.0017040020320564508, "learning_rate": 1.4966666666666666e-06, "loss": 0.0001, "num_tokens": 1629449.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 102.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.01144977193325758, "kl": 0.00036843400448560715, "learning_rate": 1.4963333333333334e-06, "loss": 0.0, "num_tokens": 1629770.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 102.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.02119687758386135, "kl": 0.09590749070048332, "learning_rate": 1.496e-06, "loss": 0.0048, "num_tokens": 1630142.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 102.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036402358673512936, "kl": 0.0003831803915090859, "learning_rate": 1.4956666666666667e-06, "loss": 0.0, "num_tokens": 1630362.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 102.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.038713980466127396, "kl": 0.16312626004219055, "learning_rate": 1.4953333333333335e-06, "loss": 0.0082, "num_tokens": 1630671.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 102.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00778679084032774, "kl": 0.0016133278841152787, "learning_rate": 1.495e-06, "loss": 0.0001, "num_tokens": 1630931.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 102.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.05482223629951477, "kl": 0.006359761813655496, "learning_rate": 1.4946666666666667e-06, "loss": 0.0003, "num_tokens": 1631201.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 102.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 2.4725849628448486, "kl": 0.07971523702144623, "learning_rate": 1.4943333333333334e-06, "loss": -0.0011, "num_tokens": 1631589.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 102.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06893176585435867, "kl": 0.003924438817193732, "learning_rate": 1.494e-06, "loss": 0.0002, "num_tokens": 1631859.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 102.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.021440302953124046, "kl": 0.0018459950806573033, "learning_rate": 1.4936666666666666e-06, "loss": 0.0001, "num_tokens": 1632155.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 102.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 4.42820930480957, "kl": 0.025614461861550808, "learning_rate": 1.4933333333333334e-06, "loss": 0.2842, "num_tokens": 1632470.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 5521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 102.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0855054184794426, "kl": 0.014258846058510244, "learning_rate": 1.493e-06, "loss": 0.0007, "num_tokens": 1632794.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 102.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09472135454416275, "kl": 0.004273287137039006, "learning_rate": 1.4926666666666667e-06, "loss": 0.0002, "num_tokens": 1633064.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 102.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 1.9774796962738037, "kl": 0.05759480409324169, "learning_rate": 1.4923333333333335e-06, "loss": 0.0005, "num_tokens": 1633392.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 5524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 102.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 1.1560639142990112, "kl": 0.4193668905645609, "learning_rate": 1.492e-06, "loss": 0.0385, "num_tokens": 1633798.0, "reward": 2.25, "reward_std": 1.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.5, "step": 5525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 102.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.018021684139966965, "kl": 0.0005127191543579102, "learning_rate": 1.4916666666666666e-06, "loss": 0.0, "num_tokens": 1634010.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 102.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.020977525040507317, "kl": 0.005387601675465703, "learning_rate": 1.4913333333333334e-06, "loss": 0.0003, "num_tokens": 1634278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 102.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07715736329555511, "kl": 0.009784580208361149, "learning_rate": 1.491e-06, "loss": 0.0005, "num_tokens": 1634596.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 102.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.053244467824697495, "kl": 0.002207259414717555, "learning_rate": 1.4906666666666668e-06, "loss": 0.0001, "num_tokens": 1634845.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 102.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10468082129955292, "kl": 0.004478586371988058, "learning_rate": 1.4903333333333334e-06, "loss": 0.0003, "num_tokens": 1635055.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 102.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.025577636435627937, "kl": 0.0006268088181968778, "learning_rate": 1.49e-06, "loss": 0.0, "num_tokens": 1635288.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 102.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.018125461414456367, "kl": 0.05175800621509552, "learning_rate": 1.4896666666666667e-06, "loss": 0.0026, "num_tokens": 1635620.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 102.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.057896487414836884, "kl": 0.010228496976196766, "learning_rate": 1.4893333333333335e-06, "loss": 0.0005, "num_tokens": 1635880.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 102.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0621013268828392, "kl": 0.006157606840133667, "learning_rate": 1.489e-06, "loss": 0.0003, "num_tokens": 1636096.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 102.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.06662620604038239, "kl": 0.009493907913565636, "learning_rate": 1.4886666666666666e-06, "loss": 0.0005, "num_tokens": 1636430.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 102.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0063230544328689575, "kl": 0.0003096287546213716, "learning_rate": 1.4883333333333334e-06, "loss": 0.0, "num_tokens": 1636702.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 102.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.3630194664001465, "kl": 0.02920142188668251, "learning_rate": 1.488e-06, "loss": 0.0441, "num_tokens": 1637006.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 102.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.23937015235424042, "kl": 0.03838458959944546, "learning_rate": 1.4876666666666668e-06, "loss": 0.002, "num_tokens": 1637294.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 102.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.036702681332826614, "kl": 0.0050619977992028, "learning_rate": 1.4873333333333333e-06, "loss": 0.0002, "num_tokens": 1637591.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 102.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.224764108657837, "kl": 0.18924957513809204, "learning_rate": 1.487e-06, "loss": 0.0496, "num_tokens": 1637882.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 102.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.028212007135152817, "kl": 0.005184866953641176, "learning_rate": 1.4866666666666667e-06, "loss": 0.0003, "num_tokens": 1638173.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 102.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.010608435608446598, "kl": 0.008286285679787397, "learning_rate": 1.4863333333333335e-06, "loss": 0.0004, "num_tokens": 1638445.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 102.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 3.6796929634874687e-05, "kl": 2.518296241760254e-06, "learning_rate": 1.486e-06, "loss": 0.0, "num_tokens": 1638665.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 102.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008221440366469324, "kl": 0.003745429217815399, "learning_rate": 1.4856666666666668e-06, "loss": 0.0002, "num_tokens": 1638901.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 102.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.02928253449499607, "kl": 0.0031400781590491533, "learning_rate": 1.4853333333333334e-06, "loss": 0.0002, "num_tokens": 1639185.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 102.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.017769435420632362, "kl": 0.0002772510051727295, "learning_rate": 1.485e-06, "loss": 0.0, "num_tokens": 1639441.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 102.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.749978542327881, "kl": 0.02927328087389469, "learning_rate": 1.4846666666666668e-06, "loss": 0.1456, "num_tokens": 1639755.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 102.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.004367906134575605, "kl": 0.00016733705706428736, "learning_rate": 1.4843333333333333e-06, "loss": 0.0, "num_tokens": 1640015.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 102.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.06579401344060898, "kl": 0.03201588336378336, "learning_rate": 1.484e-06, "loss": 0.0017, "num_tokens": 1640361.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 102.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.021259495988488197, "kl": 0.008463053498417139, "learning_rate": 1.4836666666666667e-06, "loss": 0.0004, "num_tokens": 1640622.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 102.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.021426580846309662, "kl": 0.002101754449540749, "learning_rate": 1.4833333333333335e-06, "loss": 0.0001, "num_tokens": 1640910.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 102.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05924432352185249, "kl": 0.001211017370223999, "learning_rate": 1.483e-06, "loss": 0.0001, "num_tokens": 1641123.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 102.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04636835306882858, "kl": 0.002629845286719501, "learning_rate": 1.4826666666666668e-06, "loss": 0.0001, "num_tokens": 1641427.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 102.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.025436658412218094, "kl": 0.018649504985660315, "learning_rate": 1.4823333333333334e-06, "loss": 0.001, "num_tokens": 1641719.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 102.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.16768330335617065, "kl": 0.05305645242333412, "learning_rate": 1.482e-06, "loss": 0.0027, "num_tokens": 1642036.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 102.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.058733418583869934, "kl": 0.008151871152222157, "learning_rate": 1.4816666666666667e-06, "loss": 0.0004, "num_tokens": 1642352.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 102.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005205719266086817, "kl": 0.00038047814450692385, "learning_rate": 1.4813333333333333e-06, "loss": 0.0, "num_tokens": 1642666.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 102.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.12997817993164062, "kl": 0.07159293070435524, "learning_rate": 1.4809999999999999e-06, "loss": 0.0035, "num_tokens": 1643001.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 102.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03440767526626587, "kl": 0.01249383483082056, "learning_rate": 1.4806666666666669e-06, "loss": 0.0007, "num_tokens": 1643315.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 102.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.048427026718854904, "kl": 0.0012154094874858856, "learning_rate": 1.4803333333333334e-06, "loss": 0.0001, "num_tokens": 1643575.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 102.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.006804644130170345, "kl": 0.2676941752433777, "learning_rate": 1.48e-06, "loss": 0.0134, "num_tokens": 1643879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 103.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.05653773993253708, "kl": 0.01362143037840724, "learning_rate": 1.4796666666666668e-06, "loss": 0.0007, "num_tokens": 1644171.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 103.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02322981134057045, "kl": 0.010478208772838116, "learning_rate": 1.4793333333333334e-06, "loss": 0.0005, "num_tokens": 1644431.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 103.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0641045868396759, "kl": 0.0728888213634491, "learning_rate": 1.479e-06, "loss": 0.0037, "num_tokens": 1644809.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 103.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05591088905930519, "kl": 0.015923491679131985, "learning_rate": 1.4786666666666667e-06, "loss": 0.0008, "num_tokens": 1645153.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 103.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.016792595386505127, "kl": 0.0011760814231820405, "learning_rate": 1.4783333333333333e-06, "loss": 0.0001, "num_tokens": 1645477.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 103.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04982414469122887, "kl": 0.003077237866818905, "learning_rate": 1.4779999999999999e-06, "loss": 0.0002, "num_tokens": 1645789.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 103.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.12345301359891891, "kl": 0.006383342933986569, "learning_rate": 1.4776666666666669e-06, "loss": 0.0003, "num_tokens": 1646063.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 103.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.352280616760254, "kl": 0.005991955986246467, "learning_rate": 1.4773333333333334e-06, "loss": 0.0247, "num_tokens": 1646346.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.19035136699676514, "kl": 0.03718606033362448, "learning_rate": 1.477e-06, "loss": 0.0019, "num_tokens": 1646635.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 103.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.1458001136779785, "kl": 0.17720242589712143, "learning_rate": 1.4766666666666668e-06, "loss": 0.074, "num_tokens": 1646984.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 5571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 103.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.04590500146150589, "kl": 0.007351910462602973, "learning_rate": 1.4763333333333334e-06, "loss": 0.0004, "num_tokens": 1647301.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 103.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009573576389811933, "kl": 0.003726080060005188, "learning_rate": 1.476e-06, "loss": 0.0002, "num_tokens": 1647537.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 103.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.8554811477661133, "kl": 0.22396749258041382, "learning_rate": 1.4756666666666667e-06, "loss": 0.0387, "num_tokens": 1647847.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 103.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0559222511947155, "kl": 0.0165237532928586, "learning_rate": 1.4753333333333333e-06, "loss": 0.0009, "num_tokens": 1648120.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 103.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.011249528266489506, "kl": 0.0005452483892440796, "learning_rate": 1.4749999999999999e-06, "loss": 0.0, "num_tokens": 1648380.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 103.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02432558685541153, "kl": 0.0007009912806097418, "learning_rate": 1.4746666666666668e-06, "loss": 0.0, "num_tokens": 1648683.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 103.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.013705465942621231, "kl": 0.0003001019358634949, "learning_rate": 1.4743333333333334e-06, "loss": 0.0, "num_tokens": 1648889.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 103.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 5.446242809295654, "kl": 0.019625958055257797, "learning_rate": 1.474e-06, "loss": 0.0071, "num_tokens": 1649133.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 103.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 7.091035842895508, "kl": 0.024523595813661814, "learning_rate": 1.4736666666666668e-06, "loss": -0.0362, "num_tokens": 1649403.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 103.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005106464494019747, "kl": 0.00025747418112587184, "learning_rate": 1.4733333333333333e-06, "loss": 0.0, "num_tokens": 1649663.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 103.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.026800744235515594, "kl": 0.0011164993047714233, "learning_rate": 1.473e-06, "loss": 0.0001, "num_tokens": 1649879.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 103.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03890179097652435, "kl": 0.0009979253518395126, "learning_rate": 1.4726666666666667e-06, "loss": 0.0, "num_tokens": 1650175.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.21989014744758606, "kl": 0.03462527133524418, "learning_rate": 1.4723333333333333e-06, "loss": 0.0017, "num_tokens": 1650482.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 103.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 0.3923582434654236, "kl": 0.3532719388604164, "learning_rate": 1.472e-06, "loss": -0.0015, "num_tokens": 1650852.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 5585 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.007352941203862429, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 103.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.079900026321411, "kl": 0.1506493017077446, "learning_rate": 1.4716666666666668e-06, "loss": 0.0183, "num_tokens": 1651208.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 103.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014649950026068836, "kl": 4.5746564865112305e-06, "learning_rate": 1.4713333333333334e-06, "loss": 0.0, "num_tokens": 1651428.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.024087300524115562, "kl": 0.0020767542300745845, "learning_rate": 1.471e-06, "loss": 0.0001, "num_tokens": 1651712.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 103.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.912696838378906, "kl": 0.15406838431954384, "learning_rate": 1.4706666666666668e-06, "loss": 0.0229, "num_tokens": 1652038.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 103.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0738273486495018, "kl": 0.0131396206561476, "learning_rate": 1.4703333333333333e-06, "loss": 0.0007, "num_tokens": 1652328.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 103.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.14401409029960632, "kl": 0.024686934426426888, "learning_rate": 1.4700000000000001e-06, "loss": 0.0013, "num_tokens": 1652614.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 103.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05650556460022926, "kl": 0.006409379479009658, "learning_rate": 1.4696666666666667e-06, "loss": 0.0004, "num_tokens": 1652893.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 103.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.031053029000759125, "kl": 0.0013121436059009284, "learning_rate": 1.4693333333333333e-06, "loss": 0.0001, "num_tokens": 1653201.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 103.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.004150730557739735, "kl": 0.0003677785425679758, "learning_rate": 1.469e-06, "loss": 0.0, "num_tokens": 1653421.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 103.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.09290174394845963, "kl": 0.04169721156358719, "learning_rate": 1.4686666666666668e-06, "loss": 0.0021, "num_tokens": 1653717.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 103.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0070632826536893845, "kl": 0.2676911950111389, "learning_rate": 1.4683333333333334e-06, "loss": 0.0134, "num_tokens": 1654021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 103.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008437030017375946, "kl": 0.0005081224953755736, "learning_rate": 1.468e-06, "loss": 0.0, "num_tokens": 1654256.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 103.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.18121017515659332, "kl": 0.03690602537244558, "learning_rate": 1.4676666666666667e-06, "loss": 0.0018, "num_tokens": 1654558.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 103.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.059240154922008514, "kl": 0.03684787265956402, "learning_rate": 1.4673333333333333e-06, "loss": 0.0019, "num_tokens": 1654869.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 103.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 3.1925406455993652, "kl": 0.04431818501325324, "learning_rate": 1.467e-06, "loss": -0.0011, "num_tokens": 1655129.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 5600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 103.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.022968411445617676, "kl": 0.0036716292379423976, "learning_rate": 1.4666666666666667e-06, "loss": 0.0002, "num_tokens": 1655459.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.011363636702299118, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 103.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 5.631124973297119, "kl": 0.04231046140193939, "learning_rate": 1.4663333333333332e-06, "loss": 0.1384, "num_tokens": 1655810.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 5602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 103.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.05637514591217041, "kl": 0.006388418842107058, "learning_rate": 1.466e-06, "loss": 0.0003, "num_tokens": 1656143.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 103.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.042977165430784225, "kl": 0.004595709848217666, "learning_rate": 1.4656666666666668e-06, "loss": 0.0002, "num_tokens": 1656411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 103.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.1270960420370102, "kl": 0.008030398981645703, "learning_rate": 1.4653333333333334e-06, "loss": 0.0004, "num_tokens": 1656680.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 103.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.04916525259613991, "kl": 0.001803569495677948, "learning_rate": 1.4650000000000002e-06, "loss": 0.0001, "num_tokens": 1656948.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 103.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03579654544591904, "kl": 0.004255052888765931, "learning_rate": 1.4646666666666667e-06, "loss": 0.0002, "num_tokens": 1657241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 103.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06768617779016495, "kl": 0.002009347837883979, "learning_rate": 1.4643333333333333e-06, "loss": 0.0001, "num_tokens": 1657498.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 103.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.8535503149032593, "kl": 0.04825827130116522, "learning_rate": 1.464e-06, "loss": -0.0154, "num_tokens": 1657789.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05372335761785507, "kl": 0.006013734498992562, "learning_rate": 1.4636666666666667e-06, "loss": 0.0003, "num_tokens": 1658073.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 103.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03579818084836006, "kl": 0.0003338456153869629, "learning_rate": 1.4633333333333332e-06, "loss": 0.0, "num_tokens": 1658285.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08977561444044113, "kl": 0.017718197777867317, "learning_rate": 1.463e-06, "loss": 0.001, "num_tokens": 1658567.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 103.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.007300083991140127, "kl": 0.00182400643825531, "learning_rate": 1.4626666666666668e-06, "loss": 0.0001, "num_tokens": 1658783.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 103.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.46247953176498413, "kl": 0.10592610016465187, "learning_rate": 1.4623333333333334e-06, "loss": 0.0056, "num_tokens": 1659194.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 103.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03938408941030502, "kl": 0.0015298050711862743, "learning_rate": 1.4620000000000001e-06, "loss": 0.0001, "num_tokens": 1659516.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 104.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.12710459530353546, "kl": 0.03987656719982624, "learning_rate": 1.4616666666666667e-06, "loss": 0.002, "num_tokens": 1659887.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 104.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.122880220413208, "kl": 0.09745741845108569, "learning_rate": 1.4613333333333333e-06, "loss": 0.0406, "num_tokens": 1660169.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 104.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.015912484377622604, "kl": 0.0006690872251056135, "learning_rate": 1.461e-06, "loss": 0.0, "num_tokens": 1660431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 104.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02676847018301487, "kl": 0.003936985274776816, "learning_rate": 1.4606666666666666e-06, "loss": 0.0002, "num_tokens": 1660691.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 104.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.2112481594085693, "kl": 0.20111770555377007, "learning_rate": 1.4603333333333332e-06, "loss": 0.0674, "num_tokens": 1661029.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 104.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.7214977741241455, "kl": 0.02366841584444046, "learning_rate": 1.4600000000000002e-06, "loss": -0.0381, "num_tokens": 1661303.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 104.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08014106750488281, "kl": 0.031496042385697365, "learning_rate": 1.4596666666666668e-06, "loss": 0.0016, "num_tokens": 1661637.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 104.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10010218620300293, "kl": 0.004007552575785667, "learning_rate": 1.4593333333333334e-06, "loss": 0.0002, "num_tokens": 1661855.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 104.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 6.5151543617248535, "kl": 0.0829065702855587, "learning_rate": 1.4590000000000001e-06, "loss": 0.1502, "num_tokens": 1662138.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 104.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.08586783707141876, "kl": 0.01794201135635376, "learning_rate": 1.4586666666666667e-06, "loss": 0.0009, "num_tokens": 1662413.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 104.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.019931908696889877, "kl": 0.0005274638533592224, "learning_rate": 1.4583333333333333e-06, "loss": 0.0, "num_tokens": 1662623.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 104.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04691576585173607, "kl": 0.00750165106728673, "learning_rate": 1.458e-06, "loss": 0.0004, "num_tokens": 1662916.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 104.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008692933479323983, "kl": 0.003747113049030304, "learning_rate": 1.4576666666666666e-06, "loss": 0.0002, "num_tokens": 1663152.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 104.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.002916396129876375, "kl": 7.414072751998901e-05, "learning_rate": 1.4573333333333332e-06, "loss": 0.0, "num_tokens": 1663364.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 104.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.010454630479216576, "kl": 0.008650038857012987, "learning_rate": 1.4570000000000002e-06, "loss": 0.0004, "num_tokens": 1663636.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 104.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.021863967180252075, "kl": 0.0014585109311155975, "learning_rate": 1.4566666666666668e-06, "loss": 0.0001, "num_tokens": 1663940.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 104.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 2.051466703414917, "kl": 0.11221451126039028, "learning_rate": 1.4563333333333333e-06, "loss": -0.0166, "num_tokens": 1664201.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 104.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.051440466195344925, "kl": 0.0034037778386846185, "learning_rate": 1.4560000000000001e-06, "loss": 0.0002, "num_tokens": 1664465.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 104.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.08346538990736008, "kl": 0.006596399703994393, "learning_rate": 1.4556666666666667e-06, "loss": 0.0004, "num_tokens": 1664819.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 104.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019270085031166673, "kl": 6.192177897901274e-05, "learning_rate": 1.4553333333333333e-06, "loss": 0.0, "num_tokens": 1665091.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 104.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005148336756974459, "kl": 0.00031048059463500977, "learning_rate": 1.455e-06, "loss": 0.0, "num_tokens": 1665351.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 104.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04582850635051727, "kl": 0.002248986216727644, "learning_rate": 1.4546666666666666e-06, "loss": 0.0001, "num_tokens": 1665665.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 104.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06564143300056458, "kl": 0.0064549262169748545, "learning_rate": 1.4543333333333332e-06, "loss": 0.0003, "num_tokens": 1665995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 104.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.2788805663585663, "kl": 0.04196894774213433, "learning_rate": 1.4540000000000002e-06, "loss": 0.0023, "num_tokens": 1666282.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 104.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.02393532544374466, "kl": 0.03826040215790272, "learning_rate": 1.4536666666666668e-06, "loss": 0.0019, "num_tokens": 1666686.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 104.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.059652626514434814, "kl": 0.025676672346889973, "learning_rate": 1.4533333333333333e-06, "loss": 0.0013, "num_tokens": 1666986.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 104.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.033939704298973083, "kl": 0.00343378446996212, "learning_rate": 1.4530000000000001e-06, "loss": 0.0002, "num_tokens": 1667270.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 104.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.7776589393615723, "kl": 0.13623983785510063, "learning_rate": 1.4526666666666667e-06, "loss": 0.1711, "num_tokens": 1667611.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 5643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 104.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.4898393154144287, "kl": 0.10298647731542587, "learning_rate": 1.4523333333333332e-06, "loss": 0.0192, "num_tokens": 1667996.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 104.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.061361782252788544, "kl": 0.002435504808090627, "learning_rate": 1.452e-06, "loss": 0.0001, "num_tokens": 1668319.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 104.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03603056073188782, "kl": 0.006204102421179414, "learning_rate": 1.4516666666666666e-06, "loss": 0.0003, "num_tokens": 1668647.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 104.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07706590741872787, "kl": 0.04034225083887577, "learning_rate": 1.4513333333333334e-06, "loss": 0.002, "num_tokens": 1669010.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 104.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.014101438224315643, "kl": 0.00380882786703296, "learning_rate": 1.4510000000000002e-06, "loss": 0.0002, "num_tokens": 1669274.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 104.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03637513890862465, "kl": 0.0034911499824374914, "learning_rate": 1.4506666666666667e-06, "loss": 0.0002, "num_tokens": 1669600.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 104.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04727701470255852, "kl": 0.008573881816118956, "learning_rate": 1.4503333333333333e-06, "loss": 0.0004, "num_tokens": 1669928.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 104.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1317235231399536, "kl": 0.007987660821527243, "learning_rate": 1.45e-06, "loss": 0.0005, "num_tokens": 1670147.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 104.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016721435531508178, "kl": 5.558133125305176e-06, "learning_rate": 1.4496666666666667e-06, "loss": 0.0, "num_tokens": 1670367.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 104.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05192406848073006, "kl": 0.004072215291671455, "learning_rate": 1.4493333333333334e-06, "loss": 0.0002, "num_tokens": 1670667.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 104.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.2850669026374817, "kl": 0.04399473685771227, "learning_rate": 1.449e-06, "loss": 0.0024, "num_tokens": 1670963.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 104.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07298750430345535, "kl": 0.002120365621522069, "learning_rate": 1.4486666666666666e-06, "loss": 0.0002, "num_tokens": 1671179.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 104.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.07781278342008591, "kl": 0.004349625436589122, "learning_rate": 1.4483333333333334e-06, "loss": 0.0002, "num_tokens": 1671427.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 104.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.004988092463463545, "kl": 8.647441791254096e-05, "learning_rate": 1.4480000000000002e-06, "loss": 0.0, "num_tokens": 1671683.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 104.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0739312469959259, "kl": 0.009387121070176363, "learning_rate": 1.4476666666666667e-06, "loss": 0.0005, "num_tokens": 1672023.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 104.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03655466064810753, "kl": 0.004713650327175856, "learning_rate": 1.4473333333333333e-06, "loss": 0.0002, "num_tokens": 1672309.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 104.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.006822492927312851, "kl": 0.2677291929721832, "learning_rate": 1.447e-06, "loss": 0.0134, "num_tokens": 1672613.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 104.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03389998897910118, "kl": 0.002766357734799385, "learning_rate": 1.4466666666666667e-06, "loss": 0.0001, "num_tokens": 1672886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 104.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.20362868905067444, "kl": 0.03436734527349472, "learning_rate": 1.4463333333333334e-06, "loss": 0.0017, "num_tokens": 1673192.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 104.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0704147070646286, "kl": 0.16538894921541214, "learning_rate": 1.446e-06, "loss": 0.0083, "num_tokens": 1673501.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 104.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.021763790398836136, "kl": 0.0011930970067624003, "learning_rate": 1.4456666666666666e-06, "loss": 0.0001, "num_tokens": 1673736.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 104.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07429869472980499, "kl": 0.007324169855564833, "learning_rate": 1.4453333333333334e-06, "loss": 0.0004, "num_tokens": 1674030.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 104.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05000847950577736, "kl": 0.012160141952335835, "learning_rate": 1.4450000000000001e-06, "loss": 0.0006, "num_tokens": 1674316.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 104.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 7.011960983276367, "kl": 0.11599424760788679, "learning_rate": 1.4446666666666667e-06, "loss": -0.0461, "num_tokens": 1674586.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 104.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03617606684565544, "kl": 0.003139002248644829, "learning_rate": 1.4443333333333335e-06, "loss": 0.0002, "num_tokens": 1674898.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 104.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.36423632502555847, "kl": 0.05626663938164711, "learning_rate": 1.444e-06, "loss": 0.0029, "num_tokens": 1675227.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 105.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.8410458564758301, "kl": 0.20665767043828964, "learning_rate": 1.4436666666666666e-06, "loss": 0.0097, "num_tokens": 1675599.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 5670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 105.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05876588821411133, "kl": 0.0005195066332817078, "learning_rate": 1.4433333333333334e-06, "loss": 0.0, "num_tokens": 1675811.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.1140060424804688, "kl": 0.45542351389303803, "learning_rate": 1.443e-06, "loss": -0.0565, "num_tokens": 1676106.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 105.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.008352797478437424, "kl": 8.147954940795898e-05, "learning_rate": 1.4426666666666666e-06, "loss": 0.0, "num_tokens": 1676318.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 105.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.18303248286247253, "kl": 0.05314611457288265, "learning_rate": 1.4423333333333333e-06, "loss": 0.0026, "num_tokens": 1676638.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 105.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.8104941844940186, "kl": 0.27381379902362823, "learning_rate": 1.4420000000000001e-06, "loss": -0.0019, "num_tokens": 1676941.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 105.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02625441737473011, "kl": 0.003914707922376692, "learning_rate": 1.4416666666666667e-06, "loss": 0.0002, "num_tokens": 1677201.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 105.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.038965094834566116, "kl": 0.0033664272632449865, "learning_rate": 1.4413333333333335e-06, "loss": 0.0002, "num_tokens": 1677461.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 105.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 8.05448055267334, "kl": 0.28306935052387416, "learning_rate": 1.441e-06, "loss": 0.3229, "num_tokens": 1677754.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 105.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.566134214401245, "kl": 0.02297248411923647, "learning_rate": 1.4406666666666666e-06, "loss": 0.421, "num_tokens": 1678134.0, "reward": 7.300000190734863, "reward_std": 0.40000009536743164, "rewards/reward_combined/mean": 7.300000190734863, "rewards/reward_combined/std": 0.40000009536743164, "step": 5679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 105.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.026033587753772736, "kl": 0.0010442649654578418, "learning_rate": 1.4403333333333334e-06, "loss": 0.0001, "num_tokens": 1678448.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 105.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.47929310798645, "kl": 0.06732975505292416, "learning_rate": 1.44e-06, "loss": -0.061, "num_tokens": 1678809.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 5681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 105.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.9458839893341064, "kl": 0.05476866662502289, "learning_rate": 1.4396666666666665e-06, "loss": 0.3693, "num_tokens": 1679198.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 105.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.10348553210496902, "kl": 0.005979364272207022, "learning_rate": 1.4393333333333335e-06, "loss": 0.0004, "num_tokens": 1679408.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.4981484115123749, "kl": 0.06222796067595482, "learning_rate": 1.4390000000000001e-06, "loss": 0.0031, "num_tokens": 1679683.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 105.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.018115002661943436, "kl": 0.00020355880405986682, "learning_rate": 1.4386666666666667e-06, "loss": 0.0, "num_tokens": 1679939.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 105.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 7.157373905181885, "kl": 0.06725869793444872, "learning_rate": 1.4383333333333335e-06, "loss": 0.1024, "num_tokens": 1680309.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.010161695070564747, "kl": 0.0003168337279930711, "learning_rate": 1.438e-06, "loss": 0.0, "num_tokens": 1680627.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 105.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.004255264066159725, "kl": 0.0003784775617532432, "learning_rate": 1.4376666666666666e-06, "loss": 0.0, "num_tokens": 1680847.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 105.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.012154607102274895, "kl": 0.09744952619075775, "learning_rate": 1.4373333333333334e-06, "loss": 0.0049, "num_tokens": 1681219.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 105.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005785184446722269, "kl": 0.0007093921303749084, "learning_rate": 1.437e-06, "loss": 0.0, "num_tokens": 1681463.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 105.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04391011223196983, "kl": 0.004706941545009613, "learning_rate": 1.4366666666666665e-06, "loss": 0.0002, "num_tokens": 1681679.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 105.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06214816868305206, "kl": 0.009975632186979055, "learning_rate": 1.4363333333333335e-06, "loss": 0.0005, "num_tokens": 1682035.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07380525767803192, "kl": 0.008595675462856889, "learning_rate": 1.436e-06, "loss": 0.0004, "num_tokens": 1682329.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.2147248089313507, "kl": 0.019854821148328483, "learning_rate": 1.4356666666666667e-06, "loss": 0.0011, "num_tokens": 1682602.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 105.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.024057865142822266, "kl": 0.033351522870361805, "learning_rate": 1.4353333333333335e-06, "loss": 0.0017, "num_tokens": 1683006.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 105.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.19571448862552643, "kl": 0.030045789666473866, "learning_rate": 1.435e-06, "loss": 0.0016, "num_tokens": 1683338.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 105.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.00020817856420762837, "kl": 7.569789886474609e-06, "learning_rate": 1.4346666666666666e-06, "loss": 0.0, "num_tokens": 1683558.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009750555269420147, "kl": 0.004015401005744934, "learning_rate": 1.4343333333333334e-06, "loss": 0.0002, "num_tokens": 1683838.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 105.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.017596613615751266, "kl": 0.000529117402038537, "learning_rate": 1.434e-06, "loss": 0.0, "num_tokens": 1684108.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 105.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.000807145785074681, "kl": 0.0037572383880615234, "learning_rate": 1.4336666666666665e-06, "loss": 0.0002, "num_tokens": 1684344.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 105.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04011266306042671, "kl": 0.006399928824976087, "learning_rate": 1.4333333333333335e-06, "loss": 0.0003, "num_tokens": 1684639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 105.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.09016400575637817, "kl": 0.1637478619813919, "learning_rate": 1.433e-06, "loss": 0.0082, "num_tokens": 1684949.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 105.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1597907692193985, "kl": 0.03237841837108135, "learning_rate": 1.4326666666666667e-06, "loss": 0.0017, "num_tokens": 1685249.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 105.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 7.633193016052246, "kl": 0.8263365607708693, "learning_rate": 1.4323333333333334e-06, "loss": 0.176, "num_tokens": 1685608.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 105.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.2390611320734024, "kl": 0.03568451013416052, "learning_rate": 1.432e-06, "loss": 0.0018, "num_tokens": 1685930.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5705 }, { "clip_ratio/high_max": 0.006410256493836641, "clip_ratio/high_mean": 0.006410256493836641, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006410256493836641, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 105.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.5669021606445312, "kl": 0.17164346575737, "learning_rate": 1.4316666666666666e-06, "loss": 0.0211, "num_tokens": 1686310.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 105.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.8500077724456787, "kl": 0.04501553252339363, "learning_rate": 1.4313333333333334e-06, "loss": 0.2092, "num_tokens": 1686604.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 105.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.09812217950820923, "kl": 0.0398064237087965, "learning_rate": 1.431e-06, "loss": 0.0021, "num_tokens": 1686920.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 105.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.2176378071308136, "kl": 0.019492600520607084, "learning_rate": 1.4306666666666667e-06, "loss": 0.0011, "num_tokens": 1687202.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 105.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.048696164041757584, "kl": 0.006918958388268948, "learning_rate": 1.4303333333333335e-06, "loss": 0.0004, "num_tokens": 1687500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 105.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0727105364203453, "kl": 0.010666396003216505, "learning_rate": 1.43e-06, "loss": 0.0006, "num_tokens": 1687794.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 105.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09294035285711288, "kl": 0.009225911926478148, "learning_rate": 1.4296666666666666e-06, "loss": 0.0005, "num_tokens": 1688134.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 105.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.012998809106647968, "kl": 0.0015042490558698773, "learning_rate": 1.4293333333333334e-06, "loss": 0.0001, "num_tokens": 1688411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 105.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.8534812927246094, "kl": 0.0837114229798317, "learning_rate": 1.429e-06, "loss": 0.1406, "num_tokens": 1688760.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 105.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.039196960628032684, "kl": 0.0017774586740415543, "learning_rate": 1.4286666666666668e-06, "loss": 0.0001, "num_tokens": 1689062.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 105.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.002460814779624343, "kl": 0.00044571078615263104, "learning_rate": 1.4283333333333334e-06, "loss": 0.0, "num_tokens": 1689296.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 105.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08185262233018875, "kl": 0.007727860473096371, "learning_rate": 1.428e-06, "loss": 0.0004, "num_tokens": 1689568.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 105.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01775561459362507, "kl": 0.002369391731917858, "learning_rate": 1.4276666666666667e-06, "loss": 0.0001, "num_tokens": 1689880.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 105.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.3227272629737854, "kl": 0.057049446273595095, "learning_rate": 1.4273333333333335e-06, "loss": 0.0036, "num_tokens": 1690177.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 105.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.1344035565853119, "kl": 0.013386741280555725, "learning_rate": 1.427e-06, "loss": 0.0007, "num_tokens": 1690451.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.02759510837495327, "kl": 0.0032271374948322773, "learning_rate": 1.4266666666666666e-06, "loss": 0.0002, "num_tokens": 1690741.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 105.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 9.844181060791016, "kl": 0.060532329604029655, "learning_rate": 1.4263333333333334e-06, "loss": -0.1647, "num_tokens": 1691052.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 105.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.008276209235191345, "kl": 0.00040553510189056396, "learning_rate": 1.426e-06, "loss": 0.0, "num_tokens": 1691312.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 106.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.5478247404098511, "kl": 0.10455014044418931, "learning_rate": 1.4256666666666668e-06, "loss": 0.0056, "num_tokens": 1691582.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 106.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02557133510708809, "kl": 0.0004163682460784912, "learning_rate": 1.4253333333333333e-06, "loss": 0.0, "num_tokens": 1691792.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 106.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.050726309418678284, "kl": 0.011864948086440563, "learning_rate": 1.425e-06, "loss": 0.0006, "num_tokens": 1692126.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 106.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.0785820484161377, "kl": 0.05373286455869675, "learning_rate": 1.4246666666666667e-06, "loss": 0.0204, "num_tokens": 1692397.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 106.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.061437562108039856, "kl": 0.03129961155354977, "learning_rate": 1.4243333333333335e-06, "loss": 0.0016, "num_tokens": 1692730.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 106.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.13399158418178558, "kl": 0.024402556009590626, "learning_rate": 1.424e-06, "loss": 0.0013, "num_tokens": 1693063.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 106.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.8997130393981934, "kl": 0.11000430583953857, "learning_rate": 1.4236666666666668e-06, "loss": 0.169, "num_tokens": 1693423.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 106.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.107804775238037, "kl": 0.057356974110007286, "learning_rate": 1.4233333333333334e-06, "loss": 0.115, "num_tokens": 1693733.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 106.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03562408313155174, "kl": 0.009137207642197609, "learning_rate": 1.423e-06, "loss": 0.0004, "num_tokens": 1694028.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 106.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.026240911334753036, "kl": 0.0019951232243329287, "learning_rate": 1.4226666666666668e-06, "loss": 0.0001, "num_tokens": 1694324.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 106.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.044375013560056686, "kl": 0.0008645802736282349, "learning_rate": 1.4223333333333333e-06, "loss": 0.0, "num_tokens": 1694556.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 106.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0463753268122673, "kl": 0.01417520921677351, "learning_rate": 1.422e-06, "loss": 0.0007, "num_tokens": 1694836.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 106.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.1392743587493896, "kl": 0.10343851149082184, "learning_rate": 1.4216666666666667e-06, "loss": -0.0832, "num_tokens": 1695200.0, "reward": 5.625, "reward_std": 2.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 2.75, "step": 5736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 106.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02173640951514244, "kl": 0.000323873755405657, "learning_rate": 1.4213333333333335e-06, "loss": 0.0, "num_tokens": 1695456.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 106.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04348168149590492, "kl": 0.007410618243739009, "learning_rate": 1.421e-06, "loss": 0.0004, "num_tokens": 1695747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 106.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.010557022877037525, "kl": 0.163717582821846, "learning_rate": 1.4206666666666668e-06, "loss": 0.0082, "num_tokens": 1696055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 106.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.08273578435182571, "kl": 0.004094600095413625, "learning_rate": 1.4203333333333334e-06, "loss": 0.0002, "num_tokens": 1696368.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 106.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.06082385778427124, "kl": 0.007373373955488205, "learning_rate": 1.42e-06, "loss": 0.0004, "num_tokens": 1696707.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 106.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.6933389902114868, "kl": 0.10848630405962467, "learning_rate": 1.4196666666666667e-06, "loss": 0.092, "num_tokens": 1697095.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 106.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.014743284322321415, "kl": 0.09623927995562553, "learning_rate": 1.4193333333333333e-06, "loss": 0.0048, "num_tokens": 1697468.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 106.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008723534410819411, "kl": 0.003734111785888672, "learning_rate": 1.4189999999999999e-06, "loss": 0.0002, "num_tokens": 1697704.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 106.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.046619925647974014, "kl": 0.0046431400696747005, "learning_rate": 1.4186666666666669e-06, "loss": 0.0003, "num_tokens": 1697966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 106.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.009092474356293678, "kl": 0.001768447458744049, "learning_rate": 1.4183333333333334e-06, "loss": 0.0001, "num_tokens": 1698182.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 106.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0892452523112297, "kl": 0.006601458648219705, "learning_rate": 1.418e-06, "loss": 0.0004, "num_tokens": 1698456.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 106.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.01983843743801117, "kl": 0.0015462132869288325, "learning_rate": 1.4176666666666668e-06, "loss": 0.0001, "num_tokens": 1698740.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 106.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0084643280133605, "kl": 0.26748037338256836, "learning_rate": 1.4173333333333334e-06, "loss": 0.0134, "num_tokens": 1699044.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 106.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.700222492218018, "kl": 0.01191516499966383, "learning_rate": 1.417e-06, "loss": 0.2333, "num_tokens": 1699338.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 5750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 106.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.016187675297260284, "kl": 0.0024367207661271095, "learning_rate": 1.4166666666666667e-06, "loss": 0.0001, "num_tokens": 1699650.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 106.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02424757368862629, "kl": 0.0009895925759337842, "learning_rate": 1.4163333333333333e-06, "loss": 0.0, "num_tokens": 1699920.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 106.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.11237373948097229, "kl": 0.02521005505695939, "learning_rate": 1.4159999999999999e-06, "loss": 0.0013, "num_tokens": 1700270.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 106.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01388302631676197, "kl": 0.003629215876571834, "learning_rate": 1.4156666666666669e-06, "loss": 0.0002, "num_tokens": 1700548.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 106.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06473202258348465, "kl": 0.006293442100286484, "learning_rate": 1.4153333333333334e-06, "loss": 0.0003, "num_tokens": 1700869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 106.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.0050394535064697, "kl": 0.030086702667176723, "learning_rate": 1.415e-06, "loss": 0.0002, "num_tokens": 1701217.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 5756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 106.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 9.69318675994873, "kl": 0.02784450352191925, "learning_rate": 1.4146666666666668e-06, "loss": 0.0684, "num_tokens": 1701479.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 106.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.14703218638896942, "kl": 0.004087294219061732, "learning_rate": 1.4143333333333334e-06, "loss": 0.0003, "num_tokens": 1701699.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 106.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03757351636886597, "kl": 0.002771433792077005, "learning_rate": 1.414e-06, "loss": 0.0001, "num_tokens": 1701959.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 106.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002695656439755112, "kl": 1.4044344425201416e-05, "learning_rate": 1.4136666666666667e-06, "loss": 0.0, "num_tokens": 1702179.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 106.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.6176536083221436, "kl": 0.042499613016843796, "learning_rate": 1.4133333333333333e-06, "loss": 0.0027, "num_tokens": 1702473.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 5761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 106.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.09699300676584244, "kl": 0.015115597750991583, "learning_rate": 1.4129999999999999e-06, "loss": 0.0008, "num_tokens": 1702737.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 106.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 5.399329662322998, "kl": 0.02724478906020522, "learning_rate": 1.4126666666666668e-06, "loss": 0.0516, "num_tokens": 1703018.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 106.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.006087391637265682, "kl": 0.00012576580047607422, "learning_rate": 1.4123333333333334e-06, "loss": 0.0, "num_tokens": 1703230.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 106.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.06041878089308739, "kl": 0.00847070338204503, "learning_rate": 1.412e-06, "loss": 0.0004, "num_tokens": 1703522.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 106.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03455105051398277, "kl": 0.002691088360734284, "learning_rate": 1.4116666666666668e-06, "loss": 0.0001, "num_tokens": 1703830.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 106.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01585678942501545, "kl": 0.035224828869104385, "learning_rate": 1.4113333333333333e-06, "loss": 0.0017, "num_tokens": 1704247.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 106.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.08282896131277084, "kl": 0.0035541802644729614, "learning_rate": 1.411e-06, "loss": 0.0002, "num_tokens": 1704491.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 106.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.026355255395174026, "kl": 0.0038483203388750553, "learning_rate": 1.4106666666666667e-06, "loss": 0.0002, "num_tokens": 1704751.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 106.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.28263741731643677, "kl": 0.033234777161851525, "learning_rate": 1.4103333333333333e-06, "loss": 0.0015, "num_tokens": 1705074.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 106.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03897060081362724, "kl": 0.01066239271312952, "learning_rate": 1.41e-06, "loss": 0.0005, "num_tokens": 1705358.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 106.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034062473569065332, "kl": 0.0003975331783294678, "learning_rate": 1.4096666666666668e-06, "loss": 0.0, "num_tokens": 1705578.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 106.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04890957847237587, "kl": 0.009005682077258825, "learning_rate": 1.4093333333333334e-06, "loss": 0.0005, "num_tokens": 1705846.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 106.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.009102567099034786, "kl": 0.0004160125972703099, "learning_rate": 1.409e-06, "loss": 0.0, "num_tokens": 1706164.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 106.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008985929889604449, "kl": 0.001148223876953125, "learning_rate": 1.4086666666666668e-06, "loss": 0.0001, "num_tokens": 1706444.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 106.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04493812471628189, "kl": 0.04164758883416653, "learning_rate": 1.4083333333333333e-06, "loss": 0.0022, "num_tokens": 1706796.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 106.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07303241640329361, "kl": 0.006453113630414009, "learning_rate": 1.4080000000000001e-06, "loss": 0.0003, "num_tokens": 1707067.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 107.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09221452474594116, "kl": 0.01181476260535419, "learning_rate": 1.4076666666666667e-06, "loss": 0.0006, "num_tokens": 1707337.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.9778470993041992, "kl": 0.011919782496988773, "learning_rate": 1.4073333333333333e-06, "loss": 0.0454, "num_tokens": 1707630.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 107.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.029590746387839317, "kl": 0.0021514305844902992, "learning_rate": 1.407e-06, "loss": 0.0001, "num_tokens": 1707939.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 107.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.07139435410499573, "kl": 0.013602410908788443, "learning_rate": 1.4066666666666668e-06, "loss": 0.0007, "num_tokens": 1708270.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 107.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0078984210267663, "kl": 0.26756927371025085, "learning_rate": 1.4063333333333334e-06, "loss": 0.0134, "num_tokens": 1708574.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 107.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06545332074165344, "kl": 0.01546543464064598, "learning_rate": 1.406e-06, "loss": 0.0009, "num_tokens": 1708856.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 107.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.006662515923380852, "kl": 4.482269287109375e-05, "learning_rate": 1.4056666666666667e-06, "loss": 0.0, "num_tokens": 1709068.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 107.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.005747989285737276, "kl": 0.000629764050245285, "learning_rate": 1.4053333333333333e-06, "loss": 0.0, "num_tokens": 1709312.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 107.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05726177617907524, "kl": 0.013715836685150862, "learning_rate": 1.405e-06, "loss": 0.0007, "num_tokens": 1709618.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 107.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.012316490523517132, "kl": 0.09743694961071014, "learning_rate": 1.4046666666666667e-06, "loss": 0.0049, "num_tokens": 1709990.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 107.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03672131150960922, "kl": 0.0024512840900570154, "learning_rate": 1.4043333333333332e-06, "loss": 0.0001, "num_tokens": 1710250.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06388644874095917, "kl": 0.0028850616654381156, "learning_rate": 1.404e-06, "loss": 0.0001, "num_tokens": 1710520.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 107.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05095580220222473, "kl": 0.00374322640709579, "learning_rate": 1.4036666666666668e-06, "loss": 0.0002, "num_tokens": 1710825.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.034039080142974854, "kl": 0.00621016975492239, "learning_rate": 1.4033333333333334e-06, "loss": 0.0003, "num_tokens": 1711115.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 107.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.21955059468746185, "kl": 0.026480155996978283, "learning_rate": 1.4030000000000002e-06, "loss": 0.0012, "num_tokens": 1711376.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 107.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.10853764414787292, "kl": 0.018656094325706363, "learning_rate": 1.4026666666666667e-06, "loss": 0.001, "num_tokens": 1711734.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.009091355837881565, "kl": 0.00041239398706238717, "learning_rate": 1.4023333333333333e-06, "loss": 0.0, "num_tokens": 1712054.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 107.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.07535656541585922, "kl": 0.1569998860359192, "learning_rate": 1.402e-06, "loss": 0.0078, "num_tokens": 1712369.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 107.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.5146821141242981, "kl": 0.10110709443688393, "learning_rate": 1.4016666666666667e-06, "loss": 0.005, "num_tokens": 1712786.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 107.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.6395635008811951, "kl": 0.07085668295621872, "learning_rate": 1.4013333333333332e-06, "loss": 0.0039, "num_tokens": 1713077.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 107.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1346423178911209, "kl": 0.015416470589116216, "learning_rate": 1.401e-06, "loss": 0.0008, "num_tokens": 1713407.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 107.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.016915321350098, "kl": 0.12276065722107887, "learning_rate": 1.4006666666666668e-06, "loss": -0.1486, "num_tokens": 1713766.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 5799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 107.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.519293785095215, "kl": 0.03627302497625351, "learning_rate": 1.4003333333333334e-06, "loss": -0.0489, "num_tokens": 1714116.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 107.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.4300806522369385, "kl": 0.6645773788914084, "learning_rate": 1.4000000000000001e-06, "loss": -0.0207, "num_tokens": 1714388.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 107.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.007795859128236771, "kl": 0.0020574331283569336, "learning_rate": 1.3996666666666667e-06, "loss": 0.0001, "num_tokens": 1714604.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 107.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.033531829714775085, "kl": 0.0029731886461377144, "learning_rate": 1.3993333333333333e-06, "loss": 0.0001, "num_tokens": 1714916.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 107.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.84966778755188, "kl": 0.046946557238698006, "learning_rate": 1.399e-06, "loss": -0.1613, "num_tokens": 1715227.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 5804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 107.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.026002289727330208, "kl": 0.0014131814241409302, "learning_rate": 1.3986666666666666e-06, "loss": 0.0001, "num_tokens": 1715439.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 107.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03955051675438881, "kl": 0.002813410828821361, "learning_rate": 1.3983333333333332e-06, "loss": 0.0001, "num_tokens": 1715657.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 107.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.937253952026367, "kl": 0.12958155944943428, "learning_rate": 1.3980000000000002e-06, "loss": -0.0412, "num_tokens": 1715948.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 107.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.08042522519826889, "kl": 0.031298305839300156, "learning_rate": 1.3976666666666668e-06, "loss": 0.0016, "num_tokens": 1716250.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 107.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.7190862894058228, "kl": 0.07656094618141651, "learning_rate": 1.3973333333333334e-06, "loss": 0.0428, "num_tokens": 1716586.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 5809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 107.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.008943743072450161, "kl": 0.00032437642221339047, "learning_rate": 1.3970000000000001e-06, "loss": 0.0, "num_tokens": 1716856.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 107.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.2300005555152893, "kl": 0.022971992380917072, "learning_rate": 1.3966666666666667e-06, "loss": 0.0012, "num_tokens": 1717136.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 107.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.12222582846879959, "kl": 0.041981762275099754, "learning_rate": 1.3963333333333333e-06, "loss": 0.0024, "num_tokens": 1717457.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008262148476205766, "kl": 0.001231085043400526, "learning_rate": 1.396e-06, "loss": 0.0001, "num_tokens": 1717737.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 107.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.08274615556001663, "kl": 0.00374753400683403, "learning_rate": 1.3956666666666666e-06, "loss": 0.0002, "num_tokens": 1717997.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 107.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05769728869199753, "kl": 0.007248182548210025, "learning_rate": 1.3953333333333332e-06, "loss": 0.0004, "num_tokens": 1718269.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 107.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.3610689640045166, "kl": 0.04986991360783577, "learning_rate": 1.3950000000000002e-06, "loss": 0.0078, "num_tokens": 1718637.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 107.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.14683116972446442, "kl": 0.012522359378635883, "learning_rate": 1.3946666666666668e-06, "loss": 0.0006, "num_tokens": 1718897.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 107.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.07415807247161865, "kl": 0.01777966320514679, "learning_rate": 1.3943333333333333e-06, "loss": 0.0009, "num_tokens": 1719222.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 107.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0116240493953228, "kl": 0.0008336433675140142, "learning_rate": 1.3940000000000001e-06, "loss": 0.0, "num_tokens": 1719457.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 107.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.008049916476011276, "kl": 0.00030324608087539673, "learning_rate": 1.3936666666666667e-06, "loss": 0.0, "num_tokens": 1719665.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 107.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.018872126936912537, "kl": 0.002289417083375156, "learning_rate": 1.3933333333333333e-06, "loss": 0.0001, "num_tokens": 1719947.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 107.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007522659143432975, "kl": 0.0037667453289031982, "learning_rate": 1.393e-06, "loss": 0.0002, "num_tokens": 1720183.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 107.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.02633531577885151, "kl": 0.0032656221883371472, "learning_rate": 1.3926666666666666e-06, "loss": 0.0002, "num_tokens": 1720443.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 107.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10278723388910294, "kl": 0.036409737542271614, "learning_rate": 1.3923333333333332e-06, "loss": 0.0017, "num_tokens": 1720720.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 107.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0425138883292675, "kl": 0.004189606406725943, "learning_rate": 1.3920000000000002e-06, "loss": 0.0002, "num_tokens": 1720995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 107.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001723933091852814, "kl": 5.6549906730651855e-06, "learning_rate": 1.3916666666666668e-06, "loss": 0.0, "num_tokens": 1721215.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 107.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03437779098749161, "kl": 0.002885287278331816, "learning_rate": 1.3913333333333333e-06, "loss": 0.0001, "num_tokens": 1721543.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 107.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07442420721054077, "kl": 0.011151036713272333, "learning_rate": 1.3910000000000001e-06, "loss": 0.0006, "num_tokens": 1721843.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 107.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.9135982990264893, "kl": 0.12119313701987267, "learning_rate": 1.3906666666666667e-06, "loss": 0.2191, "num_tokens": 1722223.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 107.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.034506842494010925, "kl": 0.0037065488286316395, "learning_rate": 1.3903333333333332e-06, "loss": 0.0002, "num_tokens": 1722551.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 107.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.7274201512336731, "kl": 0.03486140817403793, "learning_rate": 1.39e-06, "loss": 0.0017, "num_tokens": 1722819.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 108.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.028645237907767296, "kl": 0.0005542500639421633, "learning_rate": 1.3896666666666666e-06, "loss": 0.0, "num_tokens": 1723076.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 108.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.29568982124328613, "kl": 0.041215645149350166, "learning_rate": 1.3893333333333334e-06, "loss": 0.0024, "num_tokens": 1723364.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 108.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 6.688390254974365, "kl": 0.031630974262952805, "learning_rate": 1.3890000000000002e-06, "loss": 0.3448, "num_tokens": 1723681.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 5834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 108.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.043932944536209106, "kl": 0.017253287136554718, "learning_rate": 1.3886666666666667e-06, "loss": 0.0009, "num_tokens": 1723973.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 42.75, "completions/mean_terminated_length": 42.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 108.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.387324333190918, "kl": 0.08686452358961105, "learning_rate": 1.3883333333333333e-06, "loss": 0.0595, "num_tokens": 1724360.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 108.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00738773075863719, "kl": 0.0019551292061805725, "learning_rate": 1.388e-06, "loss": 0.0001, "num_tokens": 1724576.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 108.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03217955306172371, "kl": 0.006154460366815329, "learning_rate": 1.3876666666666667e-06, "loss": 0.0003, "num_tokens": 1724912.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 108.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02707524597644806, "kl": 0.0025644907727837563, "learning_rate": 1.3873333333333334e-06, "loss": 0.0001, "num_tokens": 1725191.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 108.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.013815261423587799, "kl": 0.09787509590387344, "learning_rate": 1.387e-06, "loss": 0.0049, "num_tokens": 1725563.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 108.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.011325486935675144, "kl": 0.00042543228482827544, "learning_rate": 1.3866666666666666e-06, "loss": 0.0, "num_tokens": 1725882.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 108.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 16.690441131591797, "kl": 1.1164008472114801, "learning_rate": 1.3863333333333334e-06, "loss": 0.0721, "num_tokens": 1726156.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 5842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 108.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.1334773600101471, "kl": 0.042811835184693336, "learning_rate": 1.3860000000000002e-06, "loss": 0.0021, "num_tokens": 1726473.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 108.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.16800856590271, "kl": 0.0005193196120671928, "learning_rate": 1.3856666666666667e-06, "loss": 0.0516, "num_tokens": 1726789.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 108.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.027450840920209885, "kl": 0.0009651482105255127, "learning_rate": 1.3853333333333333e-06, "loss": 0.0001, "num_tokens": 1727005.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 108.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.021008362993597984, "kl": 0.0007641123665962368, "learning_rate": 1.385e-06, "loss": 0.0, "num_tokens": 1727283.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 108.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.010142466984689236, "kl": 0.0020509595051407814, "learning_rate": 1.3846666666666667e-06, "loss": 0.0001, "num_tokens": 1727595.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 108.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.07904906570911407, "kl": 0.06495491415262222, "learning_rate": 1.3843333333333334e-06, "loss": 0.0032, "num_tokens": 1727931.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 108.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.950024127960205, "kl": 0.15229973196983337, "learning_rate": 1.384e-06, "loss": -0.0119, "num_tokens": 1728216.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 108.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 5.519428730010986, "kl": 0.059378063306212425, "learning_rate": 1.3836666666666666e-06, "loss": -0.0929, "num_tokens": 1728532.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 5850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 108.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01905548758804798, "kl": 0.0007375776185654104, "learning_rate": 1.3833333333333334e-06, "loss": 0.0, "num_tokens": 1728796.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 108.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0805647149682045, "kl": 0.017920501995831728, "learning_rate": 1.3830000000000001e-06, "loss": 0.001, "num_tokens": 1729059.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 108.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.00022450012329500169, "kl": 9.082257747650146e-06, "learning_rate": 1.3826666666666667e-06, "loss": 0.0, "num_tokens": 1729279.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 108.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.15444353222846985, "kl": 0.02643286157399416, "learning_rate": 1.3823333333333335e-06, "loss": 0.0013, "num_tokens": 1729547.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 108.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 5.534971714019775, "kl": 0.09001019224524498, "learning_rate": 1.382e-06, "loss": -0.0189, "num_tokens": 1729849.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 108.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04774514585733414, "kl": 0.006973513402044773, "learning_rate": 1.3816666666666666e-06, "loss": 0.0003, "num_tokens": 1730138.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 108.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 7.169586658477783, "kl": 0.1647091545164585, "learning_rate": 1.3813333333333334e-06, "loss": 0.1695, "num_tokens": 1730476.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 108.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03908167779445648, "kl": 0.0012298872461542487, "learning_rate": 1.381e-06, "loss": 0.0001, "num_tokens": 1730711.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 108.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.059330303221940994, "kl": 0.004590214230120182, "learning_rate": 1.3806666666666666e-06, "loss": 0.0002, "num_tokens": 1731017.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 108.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.012992694973945618, "kl": 0.0009702034294605255, "learning_rate": 1.3803333333333333e-06, "loss": 0.0, "num_tokens": 1731277.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 108.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.5483787059783936, "kl": 0.07131713803391904, "learning_rate": 1.3800000000000001e-06, "loss": 0.0039, "num_tokens": 1731564.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 108.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01057702861726284, "kl": 0.0004176706133875996, "learning_rate": 1.3796666666666667e-06, "loss": 0.0, "num_tokens": 1731834.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 108.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.018096506595611572, "kl": 0.0004948079586029053, "learning_rate": 1.3793333333333335e-06, "loss": 0.0, "num_tokens": 1732042.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 108.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06367022544145584, "kl": 0.02664570207707584, "learning_rate": 1.379e-06, "loss": 0.0009, "num_tokens": 1732365.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 108.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.006638620048761368, "kl": 0.2678298354148865, "learning_rate": 1.3786666666666666e-06, "loss": 0.0134, "num_tokens": 1732669.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 108.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02574746496975422, "kl": 0.033431777730584145, "learning_rate": 1.3783333333333334e-06, "loss": 0.0017, "num_tokens": 1733073.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 108.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.013389160856604576, "kl": 0.0002528697223169729, "learning_rate": 1.378e-06, "loss": 0.0, "num_tokens": 1733286.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 108.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0481690987944603, "kl": 0.0010759511023934465, "learning_rate": 1.3776666666666665e-06, "loss": 0.0001, "num_tokens": 1733543.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 108.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.645321846008301, "kl": 0.0175365237519145, "learning_rate": 1.3773333333333335e-06, "loss": -0.0406, "num_tokens": 1733842.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 108.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.09285957366228104, "kl": 0.042318107560276985, "learning_rate": 1.3770000000000001e-06, "loss": 0.0021, "num_tokens": 1734181.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 108.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.1753692626953125, "kl": 0.17156246025115252, "learning_rate": 1.3766666666666667e-06, "loss": 0.1495, "num_tokens": 1734454.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 108.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.040205683559179306, "kl": 0.0023798730690032244, "learning_rate": 1.3763333333333335e-06, "loss": 0.0001, "num_tokens": 1734724.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 108.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.028733454644680023, "kl": 0.019942507147789, "learning_rate": 1.376e-06, "loss": 0.001, "num_tokens": 1735086.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 108.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04645894840359688, "kl": 0.003785040695220232, "learning_rate": 1.3756666666666666e-06, "loss": 0.0002, "num_tokens": 1735384.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 108.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.06303104013204575, "kl": 0.0027426481246948242, "learning_rate": 1.3753333333333334e-06, "loss": 0.0001, "num_tokens": 1735638.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 108.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.31005018949508667, "kl": 0.054848093539476395, "learning_rate": 1.375e-06, "loss": 0.0026, "num_tokens": 1735926.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 108.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.040029969066381454, "kl": 0.16291894018650055, "learning_rate": 1.3746666666666665e-06, "loss": 0.0081, "num_tokens": 1736235.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 108.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.004446973092854023, "kl": 0.0003317773371236399, "learning_rate": 1.3743333333333335e-06, "loss": 0.0, "num_tokens": 1736495.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 108.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008780839270912111, "kl": 0.0037524476647377014, "learning_rate": 1.374e-06, "loss": 0.0002, "num_tokens": 1736731.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 108.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06579489260911942, "kl": 0.00977464858442545, "learning_rate": 1.3736666666666667e-06, "loss": 0.0005, "num_tokens": 1737062.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 108.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04814428091049194, "kl": 0.01773000694811344, "learning_rate": 1.3733333333333335e-06, "loss": 0.0009, "num_tokens": 1737358.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 108.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.11146484315395355, "kl": 0.02189553901553154, "learning_rate": 1.373e-06, "loss": 0.0011, "num_tokens": 1737682.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 108.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.3046683967113495, "kl": 0.02081088093109429, "learning_rate": 1.3726666666666666e-06, "loss": 0.0012, "num_tokens": 1737903.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 108.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.0937881469726562, "kl": 0.08825670927762985, "learning_rate": 1.3723333333333334e-06, "loss": 0.022, "num_tokens": 1738288.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 108.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.5862605571746826, "kl": 0.07976600714027882, "learning_rate": 1.372e-06, "loss": 0.0382, "num_tokens": 1738643.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 109.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.10566570609807968, "kl": 0.005577021976932883, "learning_rate": 1.3716666666666665e-06, "loss": 0.0003, "num_tokens": 1738903.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 109.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03496198356151581, "kl": 0.002271134697366506, "learning_rate": 1.3713333333333335e-06, "loss": 0.0001, "num_tokens": 1739176.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 109.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01300653163343668, "kl": 0.009744971990585327, "learning_rate": 1.371e-06, "loss": 0.0005, "num_tokens": 1739480.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 109.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05436874181032181, "kl": 0.012729277834296227, "learning_rate": 1.3706666666666667e-06, "loss": 0.0006, "num_tokens": 1739773.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 109.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03597012534737587, "kl": 0.003065040917135775, "learning_rate": 1.3703333333333334e-06, "loss": 0.0002, "num_tokens": 1740055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0835784301161766, "kl": 0.0030799253727309406, "learning_rate": 1.37e-06, "loss": 0.0001, "num_tokens": 1740274.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00026008597342297435, "kl": 1.1272728443145752e-05, "learning_rate": 1.3696666666666666e-06, "loss": 0.0, "num_tokens": 1740494.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 109.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03182618319988251, "kl": 0.053854506462812424, "learning_rate": 1.3693333333333334e-06, "loss": 0.0027, "num_tokens": 1740859.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 109.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.1318359375, "kl": 0.2338090594857931, "learning_rate": 1.369e-06, "loss": 0.0629, "num_tokens": 1741187.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 109.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.8306169509887695, "kl": 0.16191583452746272, "learning_rate": 1.3686666666666667e-06, "loss": 0.0818, "num_tokens": 1741487.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 109.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.15355105698108673, "kl": 0.028837244026362896, "learning_rate": 1.3683333333333335e-06, "loss": 0.0014, "num_tokens": 1741755.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 109.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.1706068515777588, "kl": 0.020157979801297188, "learning_rate": 1.368e-06, "loss": 0.001, "num_tokens": 1742049.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.000847788352984935, "kl": 0.0037572309374809265, "learning_rate": 1.3676666666666666e-06, "loss": 0.0002, "num_tokens": 1742285.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 109.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0958934798836708, "kl": 0.027908511459827423, "learning_rate": 1.3673333333333334e-06, "loss": 0.0014, "num_tokens": 1742587.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 109.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.035112205892801285, "kl": 0.006599330343306065, "learning_rate": 1.367e-06, "loss": 0.0003, "num_tokens": 1742847.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 109.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02493727020919323, "kl": 0.0010398050071671605, "learning_rate": 1.3666666666666666e-06, "loss": 0.0001, "num_tokens": 1743173.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 109.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03313284367322922, "kl": 0.007278790697455406, "learning_rate": 1.3663333333333334e-06, "loss": 0.0004, "num_tokens": 1743462.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 109.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.003395878477022052, "kl": 0.000351424008840695, "learning_rate": 1.366e-06, "loss": 0.0, "num_tokens": 1743734.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 109.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.22905008494853973, "kl": 0.02140538615640253, "learning_rate": 1.3656666666666667e-06, "loss": 0.001, "num_tokens": 1744004.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 109.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.11046409606933594, "kl": 0.02448669052682817, "learning_rate": 1.3653333333333335e-06, "loss": 0.0012, "num_tokens": 1744292.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 109.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06299640983343124, "kl": 0.002376459538936615, "learning_rate": 1.365e-06, "loss": 0.0001, "num_tokens": 1744552.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 109.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.08487951755523682, "kl": 0.00908275693655014, "learning_rate": 1.3646666666666666e-06, "loss": 0.0004, "num_tokens": 1744858.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 109.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.1807339191436768, "kl": 0.11271590366959572, "learning_rate": 1.3643333333333334e-06, "loss": 0.0134, "num_tokens": 1745228.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.022632814943790436, "kl": 0.0023123383289203048, "learning_rate": 1.364e-06, "loss": 0.0001, "num_tokens": 1745482.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 109.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07472050935029984, "kl": 0.013483912451192737, "learning_rate": 1.3636666666666668e-06, "loss": 0.0008, "num_tokens": 1745748.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 109.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07059096544981003, "kl": 0.1631886214017868, "learning_rate": 1.3633333333333333e-06, "loss": 0.0082, "num_tokens": 1746058.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 109.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.14876939356327057, "kl": 0.03801564872264862, "learning_rate": 1.363e-06, "loss": 0.0022, "num_tokens": 1746387.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 109.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.009307836182415485, "kl": 0.09792861342430115, "learning_rate": 1.3626666666666667e-06, "loss": 0.0049, "num_tokens": 1746759.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 109.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.21848231554031372, "kl": 0.026918042451143265, "learning_rate": 1.3623333333333335e-06, "loss": 0.0013, "num_tokens": 1747084.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 109.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05574426054954529, "kl": 0.027406545355916023, "learning_rate": 1.362e-06, "loss": 0.0014, "num_tokens": 1747398.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 109.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.5493011474609375, "kl": 0.11865761131048203, "learning_rate": 1.3616666666666668e-06, "loss": -0.0768, "num_tokens": 1747670.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 109.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.1623677909374237, "kl": 0.02765484107658267, "learning_rate": 1.3613333333333334e-06, "loss": 0.0014, "num_tokens": 1747999.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 109.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.097598075866699, "kl": 0.07918963208794594, "learning_rate": 1.361e-06, "loss": 0.0532, "num_tokens": 1748273.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 109.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.024052917957305908, "kl": 0.0011083036661148071, "learning_rate": 1.3606666666666668e-06, "loss": 0.0001, "num_tokens": 1748485.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 109.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.019945377483963966, "kl": 0.05039198696613312, "learning_rate": 1.3603333333333333e-06, "loss": 0.0025, "num_tokens": 1748817.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 109.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.038226112723350525, "kl": 0.0014246970458771102, "learning_rate": 1.36e-06, "loss": 0.0001, "num_tokens": 1749073.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 109.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.1107323095202446, "kl": 0.004545584321022034, "learning_rate": 1.3596666666666667e-06, "loss": 0.0002, "num_tokens": 1749284.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 109.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.050962068140506744, "kl": 0.0052134746219962835, "learning_rate": 1.3593333333333335e-06, "loss": 0.0003, "num_tokens": 1749556.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.061520714312791824, "kl": 0.0009494051337242126, "learning_rate": 1.359e-06, "loss": 0.0, "num_tokens": 1749768.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 109.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.7254226803779602, "kl": 0.06566239148378372, "learning_rate": 1.3586666666666668e-06, "loss": 0.0035, "num_tokens": 1750013.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 109.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0523926205933094, "kl": 0.001845061022322625, "learning_rate": 1.3583333333333334e-06, "loss": 0.0001, "num_tokens": 1750279.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 109.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02411804534494877, "kl": 0.009334418457001448, "learning_rate": 1.358e-06, "loss": 0.0005, "num_tokens": 1750551.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.3197144567966461, "kl": 0.025151771493256092, "learning_rate": 1.3576666666666667e-06, "loss": 0.0017, "num_tokens": 1750797.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 109.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.20844948291778564, "kl": 0.022541755810379982, "learning_rate": 1.3573333333333333e-06, "loss": 0.0011, "num_tokens": 1751131.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 109.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.048450078815221786, "kl": 0.023419609293341637, "learning_rate": 1.3569999999999999e-06, "loss": 0.0012, "num_tokens": 1751485.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 109.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.247184991836548, "kl": 0.030101838521659374, "learning_rate": 1.3566666666666669e-06, "loss": 0.2049, "num_tokens": 1751814.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 109.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.008923697285354137, "kl": 0.0018349047750234604, "learning_rate": 1.3563333333333334e-06, "loss": 0.0001, "num_tokens": 1752126.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 109.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.0056344270706177, "kl": 0.24381319480016828, "learning_rate": 1.356e-06, "loss": -0.0378, "num_tokens": 1752449.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 109.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.001068216748535633, "kl": 0.0013050096458755434, "learning_rate": 1.3556666666666668e-06, "loss": 0.0001, "num_tokens": 1752729.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 109.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03234436735510826, "kl": 0.0012776028888765723, "learning_rate": 1.3553333333333334e-06, "loss": 0.0001, "num_tokens": 1752963.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 109.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07403942197561264, "kl": 0.004665711574489251, "learning_rate": 1.355e-06, "loss": 0.0002, "num_tokens": 1753277.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 109.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.7347073554992676, "kl": 0.005849546520039439, "learning_rate": 1.3546666666666667e-06, "loss": -0.0333, "num_tokens": 1753565.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 109.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.005078271962702274, "kl": 0.26810602843761444, "learning_rate": 1.3543333333333333e-06, "loss": 0.0134, "num_tokens": 1753869.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 109.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01574855111539364, "kl": 0.043678248301148415, "learning_rate": 1.3539999999999999e-06, "loss": 0.0022, "num_tokens": 1754273.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 110.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08108259737491608, "kl": 0.012996236328035593, "learning_rate": 1.3536666666666669e-06, "loss": 0.0006, "num_tokens": 1754609.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 110.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.11689068377017975, "kl": 0.026157384738326073, "learning_rate": 1.3533333333333334e-06, "loss": 0.0013, "num_tokens": 1754883.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 110.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001752555399434641, "kl": 5.6549906730651855e-06, "learning_rate": 1.353e-06, "loss": 0.0, "num_tokens": 1755103.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 110.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.009663878940045834, "kl": 0.09788351133465767, "learning_rate": 1.3526666666666668e-06, "loss": 0.0049, "num_tokens": 1755475.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 110.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.018762705847620964, "kl": 0.0026635846588760614, "learning_rate": 1.3523333333333334e-06, "loss": 0.0001, "num_tokens": 1755807.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06893157213926315, "kl": 0.009115293622016907, "learning_rate": 1.352e-06, "loss": 0.0004, "num_tokens": 1756086.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 110.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.328707695007324, "kl": 0.1779472827911377, "learning_rate": 1.3516666666666667e-06, "loss": 0.0482, "num_tokens": 1756449.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 110.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10574982315301895, "kl": 0.006369042210280895, "learning_rate": 1.3513333333333333e-06, "loss": 0.0003, "num_tokens": 1756745.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 110.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05269327759742737, "kl": 0.0019246204756200314, "learning_rate": 1.3509999999999999e-06, "loss": 0.0001, "num_tokens": 1757006.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 110.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 2.5032901763916016, "kl": 0.34883364103734493, "learning_rate": 1.3506666666666668e-06, "loss": 0.0195, "num_tokens": 1757358.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 110.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0053315297700464725, "kl": 0.00042948126792907715, "learning_rate": 1.3503333333333334e-06, "loss": 0.0, "num_tokens": 1757578.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 110.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 1.1845754384994507, "kl": 0.12331453897058964, "learning_rate": 1.35e-06, "loss": 0.0068, "num_tokens": 1757986.0, "reward": 1.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 1.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 5951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 110.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08301302790641785, "kl": 0.0038667218759655952, "learning_rate": 1.3496666666666668e-06, "loss": 0.0002, "num_tokens": 1758304.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 110.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.032326288521289825, "kl": 0.001139427360612899, "learning_rate": 1.3493333333333333e-06, "loss": 0.0001, "num_tokens": 1758624.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 110.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.03329334035515785, "kl": 0.002319994615390897, "learning_rate": 1.349e-06, "loss": 0.0001, "num_tokens": 1758884.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 110.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033758513163775206, "kl": 0.00046894027036614716, "learning_rate": 1.3486666666666667e-06, "loss": 0.0, "num_tokens": 1759146.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 110.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.006341130938380957, "kl": 0.0007953941822052002, "learning_rate": 1.3483333333333333e-06, "loss": 0.0, "num_tokens": 1759362.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 110.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 1.607242465019226, "kl": 0.16057805716991425, "learning_rate": 1.348e-06, "loss": 0.216, "num_tokens": 1759728.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 5957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 5.060657024383545, "kl": 0.027550682425498962, "learning_rate": 1.3476666666666668e-06, "loss": 0.117, "num_tokens": 1760014.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 110.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04181768372654915, "kl": 0.01945724617689848, "learning_rate": 1.3473333333333334e-06, "loss": 0.001, "num_tokens": 1760377.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02010614238679409, "kl": 0.002240018220618367, "learning_rate": 1.347e-06, "loss": 0.0001, "num_tokens": 1760661.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 110.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03157370537519455, "kl": 0.0025909217074513435, "learning_rate": 1.3466666666666668e-06, "loss": 0.0001, "num_tokens": 1760973.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03155425190925598, "kl": 0.010468210093677044, "learning_rate": 1.3463333333333333e-06, "loss": 0.0005, "num_tokens": 1761245.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 110.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08448175340890884, "kl": 0.013919135555624962, "learning_rate": 1.346e-06, "loss": 0.0008, "num_tokens": 1761513.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 110.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008087906171567738, "kl": 0.003783509135246277, "learning_rate": 1.3456666666666667e-06, "loss": 0.0002, "num_tokens": 1761749.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 110.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04153326526284218, "kl": 0.0037280984688550234, "learning_rate": 1.3453333333333333e-06, "loss": 0.0002, "num_tokens": 1762021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 110.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0775628536939621, "kl": 0.008935235207900405, "learning_rate": 1.345e-06, "loss": 0.0004, "num_tokens": 1762350.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 110.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.3141268491744995, "kl": 0.024234028678620234, "learning_rate": 1.3446666666666668e-06, "loss": 0.0012, "num_tokens": 1762600.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 110.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.017690027132630348, "kl": 0.0002167165366699919, "learning_rate": 1.3443333333333334e-06, "loss": 0.0, "num_tokens": 1762856.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 110.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009021191857755184, "kl": 0.0004621073603630066, "learning_rate": 1.344e-06, "loss": 0.0, "num_tokens": 1763062.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 110.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.00440799817442894, "kl": 0.2682085633277893, "learning_rate": 1.3436666666666667e-06, "loss": 0.0134, "num_tokens": 1763366.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 110.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04712720587849617, "kl": 0.03256791643798351, "learning_rate": 1.3433333333333333e-06, "loss": 0.0016, "num_tokens": 1763666.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 110.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04081154614686966, "kl": 0.004494331777095795, "learning_rate": 1.343e-06, "loss": 0.0002, "num_tokens": 1763882.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 110.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1078043282032013, "kl": 0.01819766405969858, "learning_rate": 1.3426666666666667e-06, "loss": 0.0009, "num_tokens": 1764210.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 110.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08867350965738297, "kl": 0.0025535154854878783, "learning_rate": 1.3423333333333332e-06, "loss": 0.0001, "num_tokens": 1764480.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 110.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.20398961007595062, "kl": 0.09276177920401096, "learning_rate": 1.342e-06, "loss": 0.0048, "num_tokens": 1764853.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 110.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.055068567395210266, "kl": 0.1615331843495369, "learning_rate": 1.3416666666666668e-06, "loss": 0.0081, "num_tokens": 1765163.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 110.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.1508154571056366, "kl": 0.03673155512660742, "learning_rate": 1.3413333333333334e-06, "loss": 0.0018, "num_tokens": 1765462.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 110.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.15242016315460205, "kl": 0.04679614119231701, "learning_rate": 1.3410000000000002e-06, "loss": 0.0022, "num_tokens": 1765776.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 110.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.258967399597168, "kl": 0.06232669949531555, "learning_rate": 1.3406666666666667e-06, "loss": 0.1485, "num_tokens": 1766135.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 5979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 110.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.08936386555433273, "kl": 0.004410826601088047, "learning_rate": 1.3403333333333333e-06, "loss": 0.0002, "num_tokens": 1766368.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 110.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.06297900527715683, "kl": 0.0019434280693531036, "learning_rate": 1.34e-06, "loss": 0.0001, "num_tokens": 1766628.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 110.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.1812336444854736, "kl": 0.03287340234965086, "learning_rate": 1.3396666666666667e-06, "loss": 0.0081, "num_tokens": 1766932.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 5982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.23258471488952637, "kl": 0.0386712783947587, "learning_rate": 1.3393333333333332e-06, "loss": 0.0017, "num_tokens": 1767230.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 110.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.12552443146705627, "kl": 0.010302982293069363, "learning_rate": 1.339e-06, "loss": 0.0005, "num_tokens": 1767520.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.3221331238746643, "kl": 0.045488059520721436, "learning_rate": 1.3386666666666668e-06, "loss": 0.0023, "num_tokens": 1767790.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02133364975452423, "kl": 0.03848847094923258, "learning_rate": 1.3383333333333334e-06, "loss": 0.002, "num_tokens": 1768082.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 110.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.040582749992609024, "kl": 0.008734130766242743, "learning_rate": 1.3380000000000001e-06, "loss": 0.0004, "num_tokens": 1768421.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 110.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05048838257789612, "kl": 0.017860619351267815, "learning_rate": 1.3376666666666667e-06, "loss": 0.0007, "num_tokens": 1768748.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 110.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.376307487487793, "kl": 0.016093899495899677, "learning_rate": 1.3373333333333333e-06, "loss": 0.0145, "num_tokens": 1769038.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 110.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.019687240943312645, "kl": 0.0004957199125783518, "learning_rate": 1.337e-06, "loss": 0.0, "num_tokens": 1769251.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 110.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.16235597431659698, "kl": 0.0057592743542045355, "learning_rate": 1.3366666666666666e-06, "loss": 0.0003, "num_tokens": 1769527.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 110.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03277580440044403, "kl": 0.00822208309546113, "learning_rate": 1.3363333333333332e-06, "loss": 0.0004, "num_tokens": 1769787.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 110.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02874281443655491, "kl": 0.0011245176574448124, "learning_rate": 1.3360000000000002e-06, "loss": 0.0001, "num_tokens": 1770057.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 111.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06061610206961632, "kl": 0.003574213129468262, "learning_rate": 1.3356666666666668e-06, "loss": 0.0002, "num_tokens": 1770357.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 111.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.414045333862305, "kl": 0.0259128431789577, "learning_rate": 1.3353333333333334e-06, "loss": -0.0345, "num_tokens": 1770633.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 111.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04092200845479965, "kl": 0.02369754295796156, "learning_rate": 1.3350000000000001e-06, "loss": 0.0012, "num_tokens": 1770985.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 111.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.29867076873779297, "kl": 0.05115535855293274, "learning_rate": 1.3346666666666667e-06, "loss": 0.0025, "num_tokens": 1771303.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 111.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00025144696701318026, "kl": 1.0229647159576416e-05, "learning_rate": 1.3343333333333333e-06, "loss": 0.0, "num_tokens": 1771523.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 111.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.0722994804382324, "kl": 0.11085102520883083, "learning_rate": 1.334e-06, "loss": -0.0576, "num_tokens": 1771859.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 5999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 111.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.019805828109383583, "kl": 0.0006284096743911505, "learning_rate": 1.3336666666666666e-06, "loss": 0.0, "num_tokens": 1772094.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 111.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03412608802318573, "kl": 0.0025367558700963855, "learning_rate": 1.3333333333333332e-06, "loss": 0.0001, "num_tokens": 1772354.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 111.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.009219389408826828, "kl": 0.0018447404727339745, "learning_rate": 1.3330000000000002e-06, "loss": 0.0001, "num_tokens": 1772666.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 111.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.3457442820072174, "kl": 0.036103968508541584, "learning_rate": 1.3326666666666668e-06, "loss": 0.0018, "num_tokens": 1772972.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 111.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.01878589764237404, "kl": 0.0073223109357059, "learning_rate": 1.3323333333333333e-06, "loss": 0.0004, "num_tokens": 1773258.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 111.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.3248395025730133, "kl": 0.04220906086266041, "learning_rate": 1.3320000000000001e-06, "loss": 0.0023, "num_tokens": 1773631.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 111.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.044450752437114716, "kl": 0.00617857207544148, "learning_rate": 1.3316666666666667e-06, "loss": 0.0003, "num_tokens": 1773920.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 111.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.024042680859565735, "kl": 0.007981546688824892, "learning_rate": 1.3313333333333333e-06, "loss": 0.0004, "num_tokens": 1774224.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 111.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.03342939540743828, "kl": 0.0013586650602519512, "learning_rate": 1.331e-06, "loss": 0.0001, "num_tokens": 1774496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 111.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.3270973861217499, "kl": 0.03668313066009432, "learning_rate": 1.3306666666666666e-06, "loss": 0.0017, "num_tokens": 1774765.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 111.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.09173467010259628, "kl": 0.02035329419595655, "learning_rate": 1.3303333333333332e-06, "loss": 0.0011, "num_tokens": 1775051.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 111.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.014538324438035488, "kl": 0.09772773459553719, "learning_rate": 1.3300000000000002e-06, "loss": 0.0049, "num_tokens": 1775423.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 111.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.10422306507825851, "kl": 0.06520166248083115, "learning_rate": 1.3296666666666668e-06, "loss": 0.0033, "num_tokens": 1775802.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 111.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.19651436805725098, "kl": 0.08215455524623394, "learning_rate": 1.3293333333333333e-06, "loss": 0.0041, "num_tokens": 1776170.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 111.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06956064701080322, "kl": 0.016023690346628428, "learning_rate": 1.3290000000000001e-06, "loss": 0.0008, "num_tokens": 1776431.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 111.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.4350983798503876, "kl": 0.058666869066655636, "learning_rate": 1.3286666666666667e-06, "loss": 0.003, "num_tokens": 1776704.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 111.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02888704463839531, "kl": 0.0004631221236195415, "learning_rate": 1.3283333333333333e-06, "loss": 0.0, "num_tokens": 1776917.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 111.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.12169242650270462, "kl": 0.026958446018397808, "learning_rate": 1.328e-06, "loss": 0.0013, "num_tokens": 1777213.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 111.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06427167356014252, "kl": 0.006387921050190926, "learning_rate": 1.3276666666666666e-06, "loss": 0.0003, "num_tokens": 1777508.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 111.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.14414052665233612, "kl": 0.00976718142919708, "learning_rate": 1.3273333333333334e-06, "loss": 0.0004, "num_tokens": 1777766.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 111.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.021128715947270393, "kl": 0.004333413438871503, "learning_rate": 1.3270000000000002e-06, "loss": 0.0002, "num_tokens": 1778096.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 111.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02932870388031006, "kl": 0.0011645738850347698, "learning_rate": 1.3266666666666667e-06, "loss": 0.0001, "num_tokens": 1778366.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 111.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.9630263447761536, "kl": 0.18346747010946274, "learning_rate": 1.3263333333333333e-06, "loss": 0.0094, "num_tokens": 1778714.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 111.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.010983824729919434, "kl": 0.001294062938541174, "learning_rate": 1.326e-06, "loss": 0.0001, "num_tokens": 1779010.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 111.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0399341881275177, "kl": 0.0019307732582092285, "learning_rate": 1.3256666666666667e-06, "loss": 0.0001, "num_tokens": 1779218.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 111.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06619943678379059, "kl": 0.011385556310415268, "learning_rate": 1.3253333333333332e-06, "loss": 0.0006, "num_tokens": 1779549.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 111.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.000785927870310843, "kl": 0.0037786588072776794, "learning_rate": 1.325e-06, "loss": 0.0002, "num_tokens": 1779785.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 111.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.09970193356275558, "kl": 0.006901193235535175, "learning_rate": 1.3246666666666666e-06, "loss": 0.0003, "num_tokens": 1780051.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 111.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05055214837193489, "kl": 0.011935213580727577, "learning_rate": 1.3243333333333334e-06, "loss": 0.0006, "num_tokens": 1780374.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 111.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.17069141566753387, "kl": 0.013828654307872057, "learning_rate": 1.3240000000000002e-06, "loss": 0.0006, "num_tokens": 1780646.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 111.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.6104181408882141, "kl": 0.040911171585321426, "learning_rate": 1.3236666666666667e-06, "loss": 0.0025, "num_tokens": 1780895.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 111.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.005884718149900436, "kl": 0.00035889819264411926, "learning_rate": 1.3233333333333333e-06, "loss": 0.0, "num_tokens": 1781155.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 111.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06621158123016357, "kl": 0.001562038087286055, "learning_rate": 1.323e-06, "loss": 0.0001, "num_tokens": 1781411.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 111.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.2523360252380371, "kl": 0.29302704334259033, "learning_rate": 1.3226666666666667e-06, "loss": 0.0147, "num_tokens": 1781716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 111.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.022643107920885086, "kl": 0.002543122856877744, "learning_rate": 1.3223333333333334e-06, "loss": 0.0001, "num_tokens": 1781995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 111.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.21131780743598938, "kl": 0.012039016000926495, "learning_rate": 1.322e-06, "loss": 0.0007, "num_tokens": 1782216.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 111.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07311871647834778, "kl": 0.008500882424414158, "learning_rate": 1.3216666666666666e-06, "loss": 0.0004, "num_tokens": 1782484.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 111.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.022241367027163506, "kl": 0.0029014392057433724, "learning_rate": 1.3213333333333334e-06, "loss": 0.0001, "num_tokens": 1782790.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 111.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.3976516723632812, "kl": 0.5538979358971119, "learning_rate": 1.3210000000000001e-06, "loss": 0.0091, "num_tokens": 1783192.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 6038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 111.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.019791917875409126, "kl": 0.002384623629041016, "learning_rate": 1.3206666666666667e-06, "loss": 0.0001, "num_tokens": 1783476.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 111.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0845181792974472, "kl": 0.03165253438055515, "learning_rate": 1.3203333333333335e-06, "loss": 0.0016, "num_tokens": 1783791.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 111.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435367912054062, "kl": 0.011215799488127232, "learning_rate": 1.32e-06, "loss": 0.0007, "num_tokens": 1784010.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 111.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.027536695823073387, "kl": 0.0017975717782974243, "learning_rate": 1.3196666666666666e-06, "loss": 0.0001, "num_tokens": 1784222.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 111.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.011715343222022057, "kl": 0.0004479595518205315, "learning_rate": 1.3193333333333334e-06, "loss": 0.0, "num_tokens": 1784539.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 111.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 7.094869613647461, "kl": 0.017732942011207342, "learning_rate": 1.319e-06, "loss": 0.2422, "num_tokens": 1784816.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 6044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 111.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.30865153670310974, "kl": 0.204693503677845, "learning_rate": 1.3186666666666666e-06, "loss": 0.0102, "num_tokens": 1785124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 111.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.185741901397705, "kl": 0.23800479620695114, "learning_rate": 1.3183333333333333e-06, "loss": 0.1464, "num_tokens": 1785437.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 111.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.094146728515625, "kl": 0.0469056311994791, "learning_rate": 1.3180000000000001e-06, "loss": 0.0487, "num_tokens": 1785764.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 112.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.005997946485877037, "kl": 0.0006072355608921498, "learning_rate": 1.3176666666666667e-06, "loss": 0.0, "num_tokens": 1786076.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 112.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07137975096702576, "kl": 0.16442856192588806, "learning_rate": 1.3173333333333335e-06, "loss": 0.0082, "num_tokens": 1786385.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 112.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0800158903002739, "kl": 0.013670348562300205, "learning_rate": 1.317e-06, "loss": 0.0007, "num_tokens": 1786645.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 112.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.00018708205607254058, "kl": 6.757676601409912e-06, "learning_rate": 1.3166666666666666e-06, "loss": 0.0, "num_tokens": 1786865.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 112.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00198156270198524, "kl": 9.253621101379395e-05, "learning_rate": 1.3163333333333334e-06, "loss": 0.0, "num_tokens": 1787108.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 112.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.004433844238519669, "kl": 0.0003328956663608551, "learning_rate": 1.316e-06, "loss": 0.0, "num_tokens": 1787368.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 112.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01701241359114647, "kl": 0.0007004120707279071, "learning_rate": 1.3156666666666665e-06, "loss": 0.0, "num_tokens": 1787601.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 112.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796699166297913, "kl": 0.01005538646131754, "learning_rate": 1.3153333333333335e-06, "loss": 0.0006, "num_tokens": 1787820.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 112.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.061234571039676666, "kl": 0.011406462639570236, "learning_rate": 1.3150000000000001e-06, "loss": 0.0006, "num_tokens": 1788151.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 112.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.004983628634363413, "kl": 4.785507917404175e-05, "learning_rate": 1.3146666666666667e-06, "loss": 0.0, "num_tokens": 1788363.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 112.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08072728663682938, "kl": 0.05556000769138336, "learning_rate": 1.3143333333333335e-06, "loss": 0.0028, "num_tokens": 1788657.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 112.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.05396899953484535, "kl": 0.010494334623217583, "learning_rate": 1.314e-06, "loss": 0.0005, "num_tokens": 1789007.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 112.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.543977737426758, "kl": 0.12582920491695404, "learning_rate": 1.3136666666666666e-06, "loss": -0.0717, "num_tokens": 1789372.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 6060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 112.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009092212421819568, "kl": 0.0037513896822929382, "learning_rate": 1.3133333333333334e-06, "loss": 0.0002, "num_tokens": 1789608.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 112.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.5136048197746277, "kl": 0.09922398626804352, "learning_rate": 1.313e-06, "loss": 0.005, "num_tokens": 1789980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 112.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06007806956768036, "kl": 0.025501039810478687, "learning_rate": 1.3126666666666665e-06, "loss": 0.0013, "num_tokens": 1790340.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 112.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.05802278593182564, "kl": 0.008447068918030709, "learning_rate": 1.3123333333333335e-06, "loss": 0.0004, "num_tokens": 1790608.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 112.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03410980477929115, "kl": 0.005437546409666538, "learning_rate": 1.312e-06, "loss": 0.0003, "num_tokens": 1790896.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.564185380935669, "kl": 0.15899060387164354, "learning_rate": 1.3116666666666667e-06, "loss": -0.0286, "num_tokens": 1791185.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 112.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.039753057062625885, "kl": 0.0014929691096767783, "learning_rate": 1.3113333333333335e-06, "loss": 0.0001, "num_tokens": 1791507.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 112.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.200249671936035, "kl": 0.02246608305722475, "learning_rate": 1.311e-06, "loss": 0.1129, "num_tokens": 1791881.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 112.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04710720106959343, "kl": 0.0030373672489076853, "learning_rate": 1.3106666666666666e-06, "loss": 0.0002, "num_tokens": 1792177.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10301186889410019, "kl": 0.016376479528844357, "learning_rate": 1.3103333333333334e-06, "loss": 0.0008, "num_tokens": 1792473.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 112.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03799111023545265, "kl": 0.0005753666337113827, "learning_rate": 1.31e-06, "loss": 0.0, "num_tokens": 1792729.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 112.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.1706431359052658, "kl": 0.04853484034538269, "learning_rate": 1.3096666666666665e-06, "loss": 0.0024, "num_tokens": 1793029.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 112.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.545912742614746, "kl": 0.23079244047403336, "learning_rate": 1.3093333333333335e-06, "loss": 0.0285, "num_tokens": 1793362.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.20008555054664612, "kl": 0.029866354539990425, "learning_rate": 1.309e-06, "loss": 0.0018, "num_tokens": 1793654.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 112.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007798391743563116, "kl": 0.0012585946824401617, "learning_rate": 1.3086666666666667e-06, "loss": 0.0001, "num_tokens": 1793934.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 112.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09289143979549408, "kl": 0.013444689102470875, "learning_rate": 1.3083333333333334e-06, "loss": 0.0007, "num_tokens": 1794274.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 112.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 7.097061634063721, "kl": 0.01800231065135449, "learning_rate": 1.308e-06, "loss": 0.0549, "num_tokens": 1794550.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 112.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0736713632941246, "kl": 0.005710856756195426, "learning_rate": 1.3076666666666666e-06, "loss": 0.0003, "num_tokens": 1794814.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 112.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.041961751878261566, "kl": 0.0032766188960522413, "learning_rate": 1.3073333333333334e-06, "loss": 0.0002, "num_tokens": 1795144.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 112.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.2190539836883545, "kl": 0.014457188313826919, "learning_rate": 1.307e-06, "loss": 0.0532, "num_tokens": 1795481.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 112.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.037971947342157364, "kl": 0.006282295798882842, "learning_rate": 1.3066666666666667e-06, "loss": 0.0003, "num_tokens": 1795749.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 112.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.4320151805877686, "kl": 0.07949055475182831, "learning_rate": 1.3063333333333335e-06, "loss": 0.0763, "num_tokens": 1796012.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 112.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05311053991317749, "kl": 0.0035557467490434647, "learning_rate": 1.306e-06, "loss": 0.0002, "num_tokens": 1796326.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 112.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.467212200164795, "kl": 0.028736325912177563, "learning_rate": 1.3056666666666666e-06, "loss": 0.0267, "num_tokens": 1796675.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 112.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.014098790474236012, "kl": 0.09716768935322762, "learning_rate": 1.3053333333333334e-06, "loss": 0.0049, "num_tokens": 1797047.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 112.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.07768841087818146, "kl": 0.052976781502366066, "learning_rate": 1.305e-06, "loss": 0.0026, "num_tokens": 1797319.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 112.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04761912301182747, "kl": 0.0012773325142916292, "learning_rate": 1.3046666666666666e-06, "loss": 0.0, "num_tokens": 1797539.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 112.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.13366056978702545, "kl": 0.02104736864566803, "learning_rate": 1.3043333333333334e-06, "loss": 0.0011, "num_tokens": 1797849.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 112.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 4.6834845542907715, "kl": 0.2582798183429986, "learning_rate": 1.304e-06, "loss": 0.0091, "num_tokens": 1798117.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 6089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 112.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 0.7414571046829224, "kl": 0.22417202312499285, "learning_rate": 1.3036666666666667e-06, "loss": 0.0267, "num_tokens": 1798524.0, "reward": 2.25, "reward_std": 1.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.5, "step": 6090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.035348184406757355, "kl": 0.00517903221771121, "learning_rate": 1.3033333333333335e-06, "loss": 0.0002, "num_tokens": 1798824.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 112.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0469917356967926, "kl": 0.001685740160610294, "learning_rate": 1.303e-06, "loss": 0.0001, "num_tokens": 1799096.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 112.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 5.0281081199646, "kl": 0.04458103032084182, "learning_rate": 1.3026666666666666e-06, "loss": 0.053, "num_tokens": 1799317.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 112.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.022530920803546906, "kl": 0.0019829481607303023, "learning_rate": 1.3023333333333334e-06, "loss": 0.0001, "num_tokens": 1799577.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 112.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005209250375628471, "kl": 0.26801739633083344, "learning_rate": 1.302e-06, "loss": 0.0134, "num_tokens": 1799881.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 112.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.020403945818543434, "kl": 0.0027150855166837573, "learning_rate": 1.3016666666666668e-06, "loss": 0.0001, "num_tokens": 1800163.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 112.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00602941308170557, "kl": 0.000691894005285576, "learning_rate": 1.3013333333333333e-06, "loss": 0.0, "num_tokens": 1800475.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 112.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03966996446251869, "kl": 0.0006224736571311951, "learning_rate": 1.301e-06, "loss": 0.0, "num_tokens": 1800685.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 112.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.057423096150159836, "kl": 0.013602379709482193, "learning_rate": 1.3006666666666667e-06, "loss": 0.0007, "num_tokens": 1800987.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 112.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.050531454384326935, "kl": 0.005418589920736849, "learning_rate": 1.3003333333333335e-06, "loss": 0.0003, "num_tokens": 1801249.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04474520683288574, "kl": 0.008686012821272016, "learning_rate": 1.3e-06, "loss": 0.0004, "num_tokens": 1801529.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 113.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.15142762660980225, "kl": 0.04436723701655865, "learning_rate": 1.2996666666666668e-06, "loss": 0.0023, "num_tokens": 1801849.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 113.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02884383127093315, "kl": 0.0018276572227478027, "learning_rate": 1.2993333333333334e-06, "loss": 0.0001, "num_tokens": 1802061.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 113.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.2334437370300293, "kl": 0.2099862964823842, "learning_rate": 1.299e-06, "loss": 0.0022, "num_tokens": 1802408.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 6104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 113.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.07001669704914093, "kl": 0.026877841912209988, "learning_rate": 1.2986666666666668e-06, "loss": 0.0015, "num_tokens": 1802698.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 113.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.053630854934453964, "kl": 0.004849277785979211, "learning_rate": 1.2983333333333333e-06, "loss": 0.0003, "num_tokens": 1802971.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 113.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05525628849864006, "kl": 0.04509176127612591, "learning_rate": 1.298e-06, "loss": 0.0023, "num_tokens": 1803375.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 113.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0272380281239748, "kl": 0.004748962353914976, "learning_rate": 1.2976666666666667e-06, "loss": 0.0002, "num_tokens": 1803643.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 113.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06247115880250931, "kl": 0.002168981940485537, "learning_rate": 1.2973333333333335e-06, "loss": 0.0001, "num_tokens": 1803939.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 113.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03162141516804695, "kl": 0.0010609924793243408, "learning_rate": 1.297e-06, "loss": 0.0, "num_tokens": 1804145.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 113.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.16321514546871185, "kl": 0.01150759935262613, "learning_rate": 1.2966666666666668e-06, "loss": 0.0007, "num_tokens": 1804385.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 113.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.046190064400434494, "kl": 0.03093954734504223, "learning_rate": 1.2963333333333334e-06, "loss": 0.0015, "num_tokens": 1804729.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 113.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.3360211253166199, "kl": 0.030782817862927914, "learning_rate": 1.296e-06, "loss": 0.0017, "num_tokens": 1804994.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 113.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.5796241760253906, "kl": 0.09935360588133335, "learning_rate": 1.2956666666666667e-06, "loss": 0.0051, "num_tokens": 1805293.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 113.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.017644623294472694, "kl": 0.002459119656123221, "learning_rate": 1.2953333333333333e-06, "loss": 0.0001, "num_tokens": 1805575.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 113.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419870465993881, "kl": 0.16157615184783936, "learning_rate": 1.2949999999999999e-06, "loss": 0.0081, "num_tokens": 1805885.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 113.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.21626430749893188, "kl": 0.029798878356814384, "learning_rate": 1.2946666666666669e-06, "loss": 0.0014, "num_tokens": 1806231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 113.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0877583920955658, "kl": 0.012825872283428907, "learning_rate": 1.2943333333333334e-06, "loss": 0.0006, "num_tokens": 1806522.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 113.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02765658311545849, "kl": 0.0002564266324043274, "learning_rate": 1.294e-06, "loss": 0.0, "num_tokens": 1806734.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 113.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.003085933392867446, "kl": 0.00047844648361206055, "learning_rate": 1.2936666666666668e-06, "loss": 0.0, "num_tokens": 1806994.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 113.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1523253470659256, "kl": 0.04721454158425331, "learning_rate": 1.2933333333333334e-06, "loss": 0.0024, "num_tokens": 1807265.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 113.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.27010905742645264, "kl": 0.033112211152911186, "learning_rate": 1.293e-06, "loss": 0.002, "num_tokens": 1807541.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 113.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.6834784150123596, "kl": 0.060694653540849686, "learning_rate": 1.2926666666666667e-06, "loss": 0.0033, "num_tokens": 1807803.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 113.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007319572032429278, "kl": 0.0037931501865386963, "learning_rate": 1.2923333333333333e-06, "loss": 0.0002, "num_tokens": 1808039.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 113.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01870492659509182, "kl": 0.0021220995113253593, "learning_rate": 1.2919999999999999e-06, "loss": 0.0001, "num_tokens": 1808316.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 113.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05359676480293274, "kl": 0.005423239199444652, "learning_rate": 1.2916666666666669e-06, "loss": 0.0003, "num_tokens": 1808604.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 113.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.018162034451961517, "kl": 0.0004427611784194596, "learning_rate": 1.2913333333333334e-06, "loss": 0.0, "num_tokens": 1808874.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 113.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08776697516441345, "kl": 0.013297136407345533, "learning_rate": 1.291e-06, "loss": 0.0006, "num_tokens": 1809167.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 113.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.233470916748047, "kl": 0.04257943370612338, "learning_rate": 1.2906666666666668e-06, "loss": 0.0267, "num_tokens": 1809495.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 113.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.021943939849734306, "kl": 0.0002798199711833149, "learning_rate": 1.2903333333333334e-06, "loss": 0.0, "num_tokens": 1809751.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 113.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 6.470475673675537, "kl": 0.4380806051194668, "learning_rate": 1.29e-06, "loss": 0.0239, "num_tokens": 1810055.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 113.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.009376843459904194, "kl": 0.009066774509847164, "learning_rate": 1.2896666666666667e-06, "loss": 0.0005, "num_tokens": 1810327.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 113.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04478984698653221, "kl": 0.0022650300234090537, "learning_rate": 1.2893333333333333e-06, "loss": 0.0001, "num_tokens": 1810641.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 113.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.146836519241333, "kl": 0.012147336732596159, "learning_rate": 1.2889999999999999e-06, "loss": 0.0006, "num_tokens": 1810906.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 113.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.020878897979855537, "kl": 0.01223933044821024, "learning_rate": 1.2886666666666669e-06, "loss": 0.0006, "num_tokens": 1811166.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 113.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10959409177303314, "kl": 0.031592690385878086, "learning_rate": 1.2883333333333334e-06, "loss": 0.0017, "num_tokens": 1811489.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 113.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435224711894989, "kl": 0.009896425995975733, "learning_rate": 1.288e-06, "loss": 0.0005, "num_tokens": 1811793.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 113.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.005696768872439861, "kl": 0.267911359667778, "learning_rate": 1.2876666666666668e-06, "loss": 0.0134, "num_tokens": 1812097.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 113.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.29938679933547974, "kl": 0.027862831600941718, "learning_rate": 1.2873333333333333e-06, "loss": 0.0015, "num_tokens": 1812426.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 113.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.1182071641087532, "kl": 0.059394070878624916, "learning_rate": 1.287e-06, "loss": 0.003, "num_tokens": 1812766.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 113.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.00019236697698943317, "kl": 6.802380084991455e-06, "learning_rate": 1.2866666666666667e-06, "loss": 0.0, "num_tokens": 1812986.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 113.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.15949071943759918, "kl": 0.022955283522605896, "learning_rate": 1.2863333333333333e-06, "loss": 0.0012, "num_tokens": 1813284.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 113.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.006175547372549772, "kl": 0.0015174001455307007, "learning_rate": 1.286e-06, "loss": 0.0001, "num_tokens": 1813500.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 113.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.056358322501182556, "kl": 0.003626002697274089, "learning_rate": 1.2856666666666668e-06, "loss": 0.0002, "num_tokens": 1813829.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 113.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.003910984843969345, "kl": 0.00014029815793037415, "learning_rate": 1.2853333333333334e-06, "loss": 0.0, "num_tokens": 1814073.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 113.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02486460469663143, "kl": 0.0015810569748282433, "learning_rate": 1.285e-06, "loss": 0.0001, "num_tokens": 1814385.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 113.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.014391753822565079, "kl": 0.09706833586096764, "learning_rate": 1.2846666666666668e-06, "loss": 0.0049, "num_tokens": 1814757.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 113.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.004465989302843809, "kl": 0.00036110280780121684, "learning_rate": 1.2843333333333333e-06, "loss": 0.0, "num_tokens": 1814977.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 113.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.31261640787124634, "kl": 0.07626931555569172, "learning_rate": 1.284e-06, "loss": 0.0041, "num_tokens": 1815366.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 113.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.3386521339416504, "kl": 0.041719175642356277, "learning_rate": 1.2836666666666667e-06, "loss": -0.0635, "num_tokens": 1815650.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 113.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.2824010848999023, "kl": 0.061978865414857864, "learning_rate": 1.2833333333333333e-06, "loss": -0.018, "num_tokens": 1816023.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 113.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.945875883102417, "kl": 0.11103121191263199, "learning_rate": 1.283e-06, "loss": -0.0392, "num_tokens": 1816403.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 113.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.032043103128671646, "kl": 0.001565319747896865, "learning_rate": 1.2826666666666668e-06, "loss": 0.0001, "num_tokens": 1816679.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 113.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04927992820739746, "kl": 0.010830877348780632, "learning_rate": 1.2823333333333334e-06, "loss": 0.0005, "num_tokens": 1816998.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 113.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.1662204265594482, "kl": 0.13761737197637558, "learning_rate": 1.282e-06, "loss": -0.076, "num_tokens": 1817339.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 6155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 114.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.5782065391540527, "kl": 0.5165721024386585, "learning_rate": 1.2816666666666667e-06, "loss": 0.1623, "num_tokens": 1817604.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 114.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.028663193807005882, "kl": 0.00029393285512924194, "learning_rate": 1.2813333333333333e-06, "loss": 0.0, "num_tokens": 1817816.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 114.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05983349308371544, "kl": 0.011977674905210733, "learning_rate": 1.281e-06, "loss": 0.0006, "num_tokens": 1818139.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 114.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.026303723454475403, "kl": 0.0024897477123886347, "learning_rate": 1.2806666666666667e-06, "loss": 0.0001, "num_tokens": 1818419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 114.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.039252396672964096, "kl": 0.0016693869838491082, "learning_rate": 1.2803333333333332e-06, "loss": 0.0001, "num_tokens": 1818687.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 114.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05376654863357544, "kl": 0.024469844065606594, "learning_rate": 1.28e-06, "loss": 0.0012, "num_tokens": 1819051.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 114.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05036499351263046, "kl": 0.006932688876986504, "learning_rate": 1.2796666666666668e-06, "loss": 0.0003, "num_tokens": 1819342.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 114.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004733308218419552, "kl": 0.0003312766639282927, "learning_rate": 1.2793333333333334e-06, "loss": 0.0, "num_tokens": 1819562.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 114.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0364217534661293, "kl": 0.004737072857096791, "learning_rate": 1.2790000000000002e-06, "loss": 0.0002, "num_tokens": 1819850.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 114.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.10991999506950378, "kl": 0.03192046098411083, "learning_rate": 1.2786666666666667e-06, "loss": 0.0016, "num_tokens": 1820178.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.023190563544631004, "kl": 0.002787799807265401, "learning_rate": 1.2783333333333333e-06, "loss": 0.0001, "num_tokens": 1820460.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 114.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 6.263566970825195, "kl": 0.08319773897528648, "learning_rate": 1.278e-06, "loss": -0.0329, "num_tokens": 1820762.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 114.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.015998879447579384, "kl": 0.003295719623565674, "learning_rate": 1.2776666666666667e-06, "loss": 0.0002, "num_tokens": 1821066.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 114.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.00020607894111890346, "kl": 8.180737495422363e-06, "learning_rate": 1.2773333333333332e-06, "loss": 0.0, "num_tokens": 1821286.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 114.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02200189046561718, "kl": 0.012004107236862183, "learning_rate": 1.277e-06, "loss": 0.0006, "num_tokens": 1821546.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 114.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.4950074851512909, "kl": 0.09659002721309662, "learning_rate": 1.2766666666666668e-06, "loss": 0.0048, "num_tokens": 1821818.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 114.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.24328814446926117, "kl": 0.04084146022796631, "learning_rate": 1.2763333333333334e-06, "loss": 0.0021, "num_tokens": 1822137.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 114.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.006257086992263794, "kl": 0.0018291417509317398, "learning_rate": 1.2760000000000001e-06, "loss": 0.0001, "num_tokens": 1822449.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 114.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.053064122796058655, "kl": 0.0031415367411682382, "learning_rate": 1.2756666666666667e-06, "loss": 0.0002, "num_tokens": 1822698.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.015413051471114159, "kl": 0.0016451667761430144, "learning_rate": 1.2753333333333333e-06, "loss": 0.0001, "num_tokens": 1822994.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 114.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.064087390899658, "kl": 0.2727709859609604, "learning_rate": 1.275e-06, "loss": 0.0001, "num_tokens": 1823297.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 114.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 1.6945916414260864, "kl": 0.11913560517132282, "learning_rate": 1.2746666666666666e-06, "loss": 0.006, "num_tokens": 1823557.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 114.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06170351058244705, "kl": 0.007386990590021014, "learning_rate": 1.2743333333333332e-06, "loss": 0.0004, "num_tokens": 1823860.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 114.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.19605782628059387, "kl": 0.019883667584508657, "learning_rate": 1.2740000000000002e-06, "loss": 0.0011, "num_tokens": 1824204.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 114.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03629698604345322, "kl": 0.008807201404124498, "learning_rate": 1.2736666666666668e-06, "loss": 0.0004, "num_tokens": 1824488.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.13570334017276764, "kl": 0.02256380021572113, "learning_rate": 1.2733333333333334e-06, "loss": 0.0014, "num_tokens": 1824766.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 114.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.176393985748291, "kl": 0.12488642707467079, "learning_rate": 1.2730000000000001e-06, "loss": 0.0306, "num_tokens": 1825143.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 114.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02194606512784958, "kl": 0.000545364135177806, "learning_rate": 1.2726666666666667e-06, "loss": 0.0, "num_tokens": 1825411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 114.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02322235144674778, "kl": 0.00403057795483619, "learning_rate": 1.2723333333333333e-06, "loss": 0.0002, "num_tokens": 1825741.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 114.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02036534622311592, "kl": 0.0009313449263572693, "learning_rate": 1.272e-06, "loss": 0.0, "num_tokens": 1826001.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 114.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.5926064848899841, "kl": 0.06984907109290361, "learning_rate": 1.2716666666666666e-06, "loss": 0.0038, "num_tokens": 1826325.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 114.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.16381967067718506, "kl": 0.11497493088245392, "learning_rate": 1.2713333333333332e-06, "loss": 0.0057, "num_tokens": 1826697.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03609957918524742, "kl": 0.003011562628671527, "learning_rate": 1.2710000000000002e-06, "loss": 0.0002, "num_tokens": 1826969.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 114.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.059980399906635284, "kl": 0.03339464124292135, "learning_rate": 1.2706666666666668e-06, "loss": 0.0017, "num_tokens": 1827295.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 114.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08144175261259079, "kl": 0.030467216856777668, "learning_rate": 1.2703333333333333e-06, "loss": 0.0015, "num_tokens": 1827605.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 114.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007697722758166492, "kl": 0.0037793144583702087, "learning_rate": 1.2700000000000001e-06, "loss": 0.0002, "num_tokens": 1827841.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 114.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.005531348753720522, "kl": 0.0004951953742420301, "learning_rate": 1.2696666666666667e-06, "loss": 0.0, "num_tokens": 1828101.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 114.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.09417324513196945, "kl": 0.009912568144500256, "learning_rate": 1.2693333333333333e-06, "loss": 0.0005, "num_tokens": 1828435.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 114.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.055066537111997604, "kl": 0.04799327440559864, "learning_rate": 1.269e-06, "loss": 0.0024, "num_tokens": 1828839.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 114.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.09334663301706314, "kl": 0.012137975078076124, "learning_rate": 1.2686666666666666e-06, "loss": 0.0006, "num_tokens": 1829130.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 114.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.01382592786103487, "kl": 0.0015658079646527767, "learning_rate": 1.2683333333333332e-06, "loss": 0.0001, "num_tokens": 1829458.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.3998969495296478, "kl": 0.03045146632939577, "learning_rate": 1.2680000000000002e-06, "loss": 0.0015, "num_tokens": 1829717.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 114.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.028034979477524757, "kl": 0.0395667664706707, "learning_rate": 1.2676666666666668e-06, "loss": 0.002, "num_tokens": 1830009.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 114.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.019692983478307724, "kl": 0.0013635685172630474, "learning_rate": 1.2673333333333333e-06, "loss": 0.0001, "num_tokens": 1830320.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 114.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.14744527637958527, "kl": 0.016954160062596202, "learning_rate": 1.2670000000000001e-06, "loss": 0.0008, "num_tokens": 1830580.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 114.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.291611433029175, "kl": 0.015495841391384602, "learning_rate": 1.2666666666666667e-06, "loss": -0.1883, "num_tokens": 1830868.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 6201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 114.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.014595337212085724, "kl": 0.0006506634672405198, "learning_rate": 1.2663333333333333e-06, "loss": 0.0, "num_tokens": 1831103.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 114.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.045938242226839066, "kl": 0.16250810772180557, "learning_rate": 1.266e-06, "loss": 0.0081, "num_tokens": 1831412.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 114.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.2536752223968506, "kl": 0.02076407801359892, "learning_rate": 1.2656666666666666e-06, "loss": 0.0009, "num_tokens": 1831678.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 114.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022744471207261086, "kl": 0.0013867318630218506, "learning_rate": 1.2653333333333334e-06, "loss": 0.0001, "num_tokens": 1831894.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 114.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01388757023960352, "kl": 0.00033307820558547974, "learning_rate": 1.2650000000000002e-06, "loss": 0.0, "num_tokens": 1832102.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 114.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.027198635041713715, "kl": 0.000445952988229692, "learning_rate": 1.2646666666666667e-06, "loss": 0.0, "num_tokens": 1832358.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 114.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05515242740511894, "kl": 0.04820450767874718, "learning_rate": 1.2643333333333333e-06, "loss": 0.0024, "num_tokens": 1832726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 114.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.0941758155822754, "kl": 0.14915595948696136, "learning_rate": 1.264e-06, "loss": -0.0212, "num_tokens": 1833063.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 115.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.004696240648627281, "kl": 0.0011134495434816927, "learning_rate": 1.2636666666666667e-06, "loss": 0.0001, "num_tokens": 1833279.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 115.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.094312384724617, "kl": 0.04079294204711914, "learning_rate": 1.2633333333333332e-06, "loss": 0.002, "num_tokens": 1833553.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 115.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03498869761824608, "kl": 0.024879327043890953, "learning_rate": 1.263e-06, "loss": 0.0013, "num_tokens": 1833915.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 115.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.8750035762786865, "kl": 0.0767012257128954, "learning_rate": 1.2626666666666666e-06, "loss": -0.1228, "num_tokens": 1834275.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 115.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04909810796380043, "kl": 0.015334242023527622, "learning_rate": 1.2623333333333334e-06, "loss": 0.0008, "num_tokens": 1834585.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 115.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.007737711071968079, "kl": 0.007780902087688446, "learning_rate": 1.2620000000000002e-06, "loss": 0.0004, "num_tokens": 1834857.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 115.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.005330335348844528, "kl": 0.00037394398532342166, "learning_rate": 1.2616666666666667e-06, "loss": 0.0, "num_tokens": 1835171.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 115.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.021202364936470985, "kl": 0.0016954689635895193, "learning_rate": 1.2613333333333333e-06, "loss": 0.0001, "num_tokens": 1835431.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 115.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0466022789478302, "kl": 0.019457083195447922, "learning_rate": 1.261e-06, "loss": 0.001, "num_tokens": 1835793.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 115.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.009932305663824081, "kl": 0.0006368197500705719, "learning_rate": 1.2606666666666667e-06, "loss": 0.0, "num_tokens": 1836055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 115.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.012711938470602036, "kl": 0.0022020963951945305, "learning_rate": 1.2603333333333334e-06, "loss": 0.0001, "num_tokens": 1836367.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 115.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04842125251889229, "kl": 0.0027758406940847635, "learning_rate": 1.26e-06, "loss": 0.0001, "num_tokens": 1836695.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 115.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.7433775067329407, "kl": 0.03612457067356445, "learning_rate": 1.2596666666666666e-06, "loss": 0.0018, "num_tokens": 1836951.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 115.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012128106318414211, "kl": 8.714944124221802e-05, "learning_rate": 1.2593333333333334e-06, "loss": 0.0, "num_tokens": 1837195.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 115.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.12190528213977814, "kl": 0.058578457683324814, "learning_rate": 1.2590000000000001e-06, "loss": 0.0029, "num_tokens": 1837541.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 115.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06681372225284576, "kl": 0.004231396829709411, "learning_rate": 1.2586666666666667e-06, "loss": 0.0002, "num_tokens": 1837809.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 115.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.07929566502571106, "kl": 0.031787254847586155, "learning_rate": 1.2583333333333335e-06, "loss": 0.0014, "num_tokens": 1838167.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 115.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.04284188896417618, "kl": 0.04354099929332733, "learning_rate": 1.258e-06, "loss": 0.0022, "num_tokens": 1838572.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 115.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.1978859007358551, "kl": 0.019339651567861438, "learning_rate": 1.2576666666666666e-06, "loss": 0.0011, "num_tokens": 1838908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 115.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04125714674592018, "kl": 0.002634570235386491, "learning_rate": 1.2573333333333334e-06, "loss": 0.0001, "num_tokens": 1839180.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 115.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.22951935231685638, "kl": 0.024778328835964203, "learning_rate": 1.257e-06, "loss": 0.0013, "num_tokens": 1839472.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 115.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07451792061328888, "kl": 0.02749769389629364, "learning_rate": 1.2566666666666666e-06, "loss": 0.0014, "num_tokens": 1839774.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 115.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10316783934831619, "kl": 0.023797186091542244, "learning_rate": 1.2563333333333333e-06, "loss": 0.0012, "num_tokens": 1840048.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 115.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05592063441872597, "kl": 0.005200710846111178, "learning_rate": 1.2560000000000001e-06, "loss": 0.0003, "num_tokens": 1840350.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 115.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.024762703105807304, "kl": 0.0011641234159469604, "learning_rate": 1.2556666666666667e-06, "loss": 0.0001, "num_tokens": 1840562.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 115.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 1.841092586517334, "kl": 0.16811568662524223, "learning_rate": 1.2553333333333335e-06, "loss": 0.0078, "num_tokens": 1840855.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 115.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0937301516532898, "kl": 0.042482590302824974, "learning_rate": 1.255e-06, "loss": 0.0021, "num_tokens": 1841192.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 115.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.173649311065674, "kl": 0.052505326457321644, "learning_rate": 1.2546666666666666e-06, "loss": 0.1775, "num_tokens": 1841466.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 6237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 115.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0060199410654604435, "kl": 0.2678922414779663, "learning_rate": 1.2543333333333334e-06, "loss": 0.0134, "num_tokens": 1841770.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 115.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.21355508267879486, "kl": 0.0890326090157032, "learning_rate": 1.254e-06, "loss": 0.0044, "num_tokens": 1842142.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 115.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.11880111694335938, "kl": 0.017414493719115853, "learning_rate": 1.2536666666666666e-06, "loss": 0.001, "num_tokens": 1842485.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 115.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.011011568829417229, "kl": 0.0006151107081677765, "learning_rate": 1.2533333333333335e-06, "loss": 0.0, "num_tokens": 1842720.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 115.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04526427760720253, "kl": 0.12831896916031837, "learning_rate": 1.2530000000000001e-06, "loss": 0.0065, "num_tokens": 1843029.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 115.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.018583742901682854, "kl": 0.012669486925005913, "learning_rate": 1.2526666666666667e-06, "loss": 0.0006, "num_tokens": 1843289.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 115.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.000533915008418262, "kl": 0.0012635865132324398, "learning_rate": 1.2523333333333335e-06, "loss": 0.0001, "num_tokens": 1843569.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 115.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.43353140354156494, "kl": 0.02571816649287939, "learning_rate": 1.252e-06, "loss": 0.002, "num_tokens": 1843805.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 115.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.5229374170303345, "kl": 0.030647223815321922, "learning_rate": 1.2516666666666666e-06, "loss": 0.0047, "num_tokens": 1844170.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 6246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 115.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.02296511083841324, "kl": 0.0036907498724758625, "learning_rate": 1.2513333333333334e-06, "loss": 0.0002, "num_tokens": 1844440.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 115.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.21778427064418793, "kl": 0.03142505802679807, "learning_rate": 1.251e-06, "loss": 0.0016, "num_tokens": 1844725.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 115.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.11077558994293213, "kl": 0.007567106746137142, "learning_rate": 1.2506666666666665e-06, "loss": 0.0004, "num_tokens": 1845023.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 115.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.1737356334924698, "kl": 0.0209151110611856, "learning_rate": 1.2503333333333335e-06, "loss": 0.001, "num_tokens": 1845315.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 115.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018107322975993156, "kl": 0.00026499107480049133, "learning_rate": 1.25e-06, "loss": 0.0, "num_tokens": 1845575.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 115.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05953364074230194, "kl": 0.003828287524811458, "learning_rate": 1.2496666666666667e-06, "loss": 0.0002, "num_tokens": 1845851.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 115.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007998199434950948, "kl": 0.0037676095962524414, "learning_rate": 1.2493333333333335e-06, "loss": 0.0002, "num_tokens": 1846087.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 115.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.012243026867508888, "kl": 0.09739917516708374, "learning_rate": 1.249e-06, "loss": 0.0049, "num_tokens": 1846459.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 115.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05324836075305939, "kl": 0.01230662316083908, "learning_rate": 1.2486666666666666e-06, "loss": 0.0006, "num_tokens": 1846786.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 115.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.6422905921936035, "kl": 0.07225701492279768, "learning_rate": 1.2483333333333334e-06, "loss": 0.0026, "num_tokens": 1847102.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 115.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014937584637664258, "kl": 4.7013163566589355e-06, "learning_rate": 1.248e-06, "loss": 0.0, "num_tokens": 1847322.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 115.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04233884811401367, "kl": 0.001320550829404965, "learning_rate": 1.2476666666666665e-06, "loss": 0.0001, "num_tokens": 1847590.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 115.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04144356772303581, "kl": 0.008482268545776606, "learning_rate": 1.2473333333333335e-06, "loss": 0.0004, "num_tokens": 1847872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 115.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.008734427392482758, "kl": 0.0006128549721324816, "learning_rate": 1.247e-06, "loss": 0.0, "num_tokens": 1848091.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 115.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.3464406132698059, "kl": 0.04284094646573067, "learning_rate": 1.2466666666666667e-06, "loss": 0.0024, "num_tokens": 1848302.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 115.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04640262573957443, "kl": 0.0011221036547794938, "learning_rate": 1.2463333333333334e-06, "loss": 0.0001, "num_tokens": 1848515.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 115.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09202194213867188, "kl": 0.009811186231672764, "learning_rate": 1.246e-06, "loss": 0.0005, "num_tokens": 1848808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 116.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03162007033824921, "kl": 0.015394420363008976, "learning_rate": 1.2456666666666666e-06, "loss": 0.0008, "num_tokens": 1849095.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 116.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1439974457025528, "kl": 0.0052751151961274445, "learning_rate": 1.2453333333333334e-06, "loss": 0.0003, "num_tokens": 1849351.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 116.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.15991061925888062, "kl": 0.042885567992925644, "learning_rate": 1.245e-06, "loss": 0.0019, "num_tokens": 1849644.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.75, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 116.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.141469955444336, "kl": 0.045004235580563545, "learning_rate": 1.2446666666666667e-06, "loss": 0.2184, "num_tokens": 1850047.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 6267 }, { "clip_ratio/high_max": 0.00909090880304575, "clip_ratio/high_mean": 0.00909090880304575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00909090880304575, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 116.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 4.942110061645508, "kl": 0.7526375111192465, "learning_rate": 1.2443333333333335e-06, "loss": 0.0343, "num_tokens": 1850406.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 6268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 116.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03768537566065788, "kl": 0.00553034245967865, "learning_rate": 1.244e-06, "loss": 0.0003, "num_tokens": 1850699.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07933907955884933, "kl": 0.004391956143081188, "learning_rate": 1.2436666666666666e-06, "loss": 0.0002, "num_tokens": 1850995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00708309980109334, "kl": 0.0014140590792521834, "learning_rate": 1.2433333333333334e-06, "loss": 0.0001, "num_tokens": 1851272.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 116.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0064334324561059475, "kl": 0.2678179293870926, "learning_rate": 1.243e-06, "loss": 0.0134, "num_tokens": 1851576.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 116.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.000789306708611548, "kl": 0.003765881061553955, "learning_rate": 1.2426666666666666e-06, "loss": 0.0002, "num_tokens": 1851812.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 116.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.020940933376550674, "kl": 0.012090378440916538, "learning_rate": 1.2423333333333334e-06, "loss": 0.0006, "num_tokens": 1852072.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 116.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.010890180245041847, "kl": 0.000506911426782608, "learning_rate": 1.242e-06, "loss": 0.0, "num_tokens": 1852332.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 116.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.9097113609313965, "kl": 0.03586278576403856, "learning_rate": 1.2416666666666667e-06, "loss": -0.0026, "num_tokens": 1852640.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 6276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.06783688068389893, "kl": 0.018612314481288195, "learning_rate": 1.2413333333333335e-06, "loss": 0.001, "num_tokens": 1852914.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 116.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.012533112429082394, "kl": 0.09734746441245079, "learning_rate": 1.241e-06, "loss": 0.0049, "num_tokens": 1853286.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02706988900899887, "kl": 0.001588658895343542, "learning_rate": 1.2406666666666666e-06, "loss": 0.0001, "num_tokens": 1853558.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 116.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.023116370663046837, "kl": 0.0013605743151856586, "learning_rate": 1.2403333333333334e-06, "loss": 0.0001, "num_tokens": 1853777.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 116.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.1760350912809372, "kl": 0.18322789669036865, "learning_rate": 1.24e-06, "loss": 0.0092, "num_tokens": 1854085.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 116.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04928117245435715, "kl": 0.009053825866430998, "learning_rate": 1.2396666666666668e-06, "loss": 0.0005, "num_tokens": 1854501.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 116.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05760377272963524, "kl": 0.015146711841225624, "learning_rate": 1.2393333333333333e-06, "loss": 0.0008, "num_tokens": 1854802.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 116.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.17278456687927246, "kl": 0.06444323062896729, "learning_rate": 1.239e-06, "loss": 0.0032, "num_tokens": 1855206.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 116.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04154883325099945, "kl": 0.001979883905733004, "learning_rate": 1.2386666666666667e-06, "loss": 0.0001, "num_tokens": 1855482.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 116.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09533293545246124, "kl": 0.01082206517457962, "learning_rate": 1.2383333333333335e-06, "loss": 0.0005, "num_tokens": 1855782.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 116.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.14209595322608948, "kl": 0.017969570588320494, "learning_rate": 1.238e-06, "loss": 0.0009, "num_tokens": 1856095.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 116.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.5980515480041504, "kl": 0.08496873347030487, "learning_rate": 1.2376666666666666e-06, "loss": 0.0045, "num_tokens": 1856339.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6288 }, { "clip_ratio/high_max": 0.006097560748457909, "clip_ratio/high_mean": 0.006097560748457909, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006097560748457909, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 116.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.7978053092956543, "kl": 0.08401504904031754, "learning_rate": 1.2373333333333334e-06, "loss": 0.0982, "num_tokens": 1856726.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 6289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 116.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0768980011343956, "kl": 0.007214951561763883, "learning_rate": 1.237e-06, "loss": 0.0004, "num_tokens": 1857056.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 116.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0632239505648613, "kl": 0.0033106408081948757, "learning_rate": 1.2366666666666668e-06, "loss": 0.0002, "num_tokens": 1857375.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 116.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.6029794216156006, "kl": 0.14259984344244003, "learning_rate": 1.2363333333333333e-06, "loss": 0.0641, "num_tokens": 1857768.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.514859199523926, "kl": 0.05777551420032978, "learning_rate": 1.236e-06, "loss": -0.0882, "num_tokens": 1858047.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 116.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02193385921418667, "kl": 0.001887825084850192, "learning_rate": 1.2356666666666667e-06, "loss": 0.0001, "num_tokens": 1858307.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 116.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0277364794164896, "kl": 0.0019534826278686523, "learning_rate": 1.2353333333333335e-06, "loss": 0.0001, "num_tokens": 1858519.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 116.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.028444884344935417, "kl": 0.004912725416943431, "learning_rate": 1.235e-06, "loss": 0.0002, "num_tokens": 1858807.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 116.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.23661844432353973, "kl": 0.022035363130271435, "learning_rate": 1.2346666666666668e-06, "loss": 0.0011, "num_tokens": 1859065.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 116.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10691499710083008, "kl": 0.021265359595417976, "learning_rate": 1.2343333333333334e-06, "loss": 0.001, "num_tokens": 1859364.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 116.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.598431348800659, "kl": 0.06902923434972763, "learning_rate": 1.234e-06, "loss": -0.0137, "num_tokens": 1859667.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 116.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.06887342035770416, "kl": 0.024940460920333862, "learning_rate": 1.2336666666666667e-06, "loss": 0.0013, "num_tokens": 1859991.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 116.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.17546403408050537, "kl": 0.013412305852398276, "learning_rate": 1.2333333333333333e-06, "loss": 0.0006, "num_tokens": 1860257.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 116.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.08546403050422668, "kl": 0.00312786060385406, "learning_rate": 1.2329999999999999e-06, "loss": 0.0002, "num_tokens": 1860467.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 116.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06565514951944351, "kl": 0.005158094922080636, "learning_rate": 1.2326666666666669e-06, "loss": 0.0003, "num_tokens": 1860798.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 116.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.006790246348828077, "kl": 0.00011561065912246704, "learning_rate": 1.2323333333333334e-06, "loss": 0.0, "num_tokens": 1861010.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 116.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.061141323298215866, "kl": 0.04211212135851383, "learning_rate": 1.232e-06, "loss": 0.0022, "num_tokens": 1861301.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 116.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 2.4490175247192383, "kl": 0.32804083079099655, "learning_rate": 1.2316666666666668e-06, "loss": 0.0182, "num_tokens": 1861522.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 116.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.791076898574829, "kl": 0.07484490307979286, "learning_rate": 1.2313333333333334e-06, "loss": 0.0023, "num_tokens": 1861788.0, "reward": 5.5, "reward_std": 3.316624879837036, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 3.316624879837036, "step": 6307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 116.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.015503501519560814, "kl": 0.0007985396514413878, "learning_rate": 1.231e-06, "loss": 0.0, "num_tokens": 1862097.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 116.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05024037882685661, "kl": 0.010773248039186, "learning_rate": 1.2306666666666667e-06, "loss": 0.0005, "num_tokens": 1862432.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 116.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.011990761384367943, "kl": 0.00043349614134058356, "learning_rate": 1.2303333333333333e-06, "loss": 0.0, "num_tokens": 1862702.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 116.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08177473396062851, "kl": 0.014754015253856778, "learning_rate": 1.2299999999999999e-06, "loss": 0.0008, "num_tokens": 1862986.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 116.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.004946709610521793, "kl": 0.00031425655470229685, "learning_rate": 1.2296666666666669e-06, "loss": 0.0, "num_tokens": 1863246.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 116.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.17845740914344788, "kl": 0.020642086397856474, "learning_rate": 1.2293333333333334e-06, "loss": 0.0011, "num_tokens": 1863538.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 116.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068844957277178764, "kl": 0.00207352451980114, "learning_rate": 1.229e-06, "loss": 0.0001, "num_tokens": 1863850.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 116.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.024111207574605942, "kl": 0.000682694575516507, "learning_rate": 1.2286666666666668e-06, "loss": 0.0, "num_tokens": 1864084.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.020938321948051453, "kl": 0.002724895952269435, "learning_rate": 1.2283333333333334e-06, "loss": 0.0001, "num_tokens": 1864366.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 116.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.6453166007995605, "kl": 0.02425048127770424, "learning_rate": 1.228e-06, "loss": -0.0214, "num_tokens": 1864684.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.11007286608219147, "kl": 0.0028574815951287746, "learning_rate": 1.2276666666666667e-06, "loss": 0.0002, "num_tokens": 1864911.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.015436398796737194, "kl": 0.0014515546499751508, "learning_rate": 1.2273333333333333e-06, "loss": 0.0001, "num_tokens": 1865179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 117.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.6589757204055786, "kl": 0.05297594587318599, "learning_rate": 1.2269999999999999e-06, "loss": 0.003, "num_tokens": 1865475.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 117.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.020373262465000153, "kl": 0.009699301328510046, "learning_rate": 1.2266666666666669e-06, "loss": 0.0005, "num_tokens": 1865769.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.059715352952480316, "kl": 0.0029501643730327487, "learning_rate": 1.2263333333333334e-06, "loss": 0.0001, "num_tokens": 1866065.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 117.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.0689995288848877, "kl": 0.4349171072244644, "learning_rate": 1.226e-06, "loss": 0.0377, "num_tokens": 1866439.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 117.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004212557338178158, "kl": 0.0002928256872110069, "learning_rate": 1.2256666666666668e-06, "loss": 0.0, "num_tokens": 1866659.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 117.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.061383381485939026, "kl": 0.0010862275958061218, "learning_rate": 1.2253333333333333e-06, "loss": 0.0001, "num_tokens": 1866869.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 117.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.005569239612668753, "kl": 0.0006695190968457609, "learning_rate": 1.225e-06, "loss": 0.0, "num_tokens": 1867085.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.04014762118458748, "kl": 0.0036010071635246277, "learning_rate": 1.2246666666666667e-06, "loss": 0.0002, "num_tokens": 1867301.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 117.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.12944601476192474, "kl": 0.03632536344230175, "learning_rate": 1.2243333333333333e-06, "loss": 0.0018, "num_tokens": 1867572.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 117.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.012796190567314625, "kl": 0.00048668310046195984, "learning_rate": 1.224e-06, "loss": 0.0, "num_tokens": 1867832.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.5973250865936279, "kl": 0.04203704744577408, "learning_rate": 1.2236666666666668e-06, "loss": 0.0021, "num_tokens": 1868068.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 117.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.024066155776381493, "kl": 0.000497678731335327, "learning_rate": 1.2233333333333334e-06, "loss": 0.0, "num_tokens": 1868317.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 117.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.007013546768575907, "kl": 0.26768939197063446, "learning_rate": 1.223e-06, "loss": 0.0134, "num_tokens": 1868621.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 117.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09192944318056107, "kl": 0.014576570130884647, "learning_rate": 1.2226666666666668e-06, "loss": 0.0006, "num_tokens": 1868914.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 117.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.656205177307129, "kl": 0.06435461342334747, "learning_rate": 1.2223333333333333e-06, "loss": 0.0036, "num_tokens": 1869248.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.00018164316134061664, "kl": 5.990266799926758e-06, "learning_rate": 1.222e-06, "loss": 0.0, "num_tokens": 1869468.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 117.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.1757955104112625, "kl": 0.05981072038412094, "learning_rate": 1.2216666666666667e-06, "loss": 0.0029, "num_tokens": 1869774.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 117.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.13805396854877472, "kl": 0.017180890077725053, "learning_rate": 1.2213333333333333e-06, "loss": 0.0009, "num_tokens": 1870068.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 117.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03243164345622063, "kl": 0.004565113689750433, "learning_rate": 1.221e-06, "loss": 0.0002, "num_tokens": 1870372.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 117.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.016014624387025833, "kl": 0.0007988003198988736, "learning_rate": 1.2206666666666668e-06, "loss": 0.0, "num_tokens": 1870634.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 117.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.478219985961914, "kl": 0.1537754898890853, "learning_rate": 1.2203333333333334e-06, "loss": 0.0279, "num_tokens": 1870938.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 117.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.061416368931531906, "kl": 0.005006879044231027, "learning_rate": 1.22e-06, "loss": 0.0003, "num_tokens": 1871265.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 117.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.1589430272579193, "kl": 0.02481132373213768, "learning_rate": 1.2196666666666667e-06, "loss": 0.0012, "num_tokens": 1871580.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 117.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05281635746359825, "kl": 0.03615730442106724, "learning_rate": 1.2193333333333333e-06, "loss": 0.0018, "num_tokens": 1871953.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 117.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.006940820254385471, "kl": 0.0001445829875592608, "learning_rate": 1.219e-06, "loss": 0.0, "num_tokens": 1872209.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 117.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0402204766869545, "kl": 0.009526151698082685, "learning_rate": 1.2186666666666667e-06, "loss": 0.0005, "num_tokens": 1872468.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 117.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.200370788574219, "kl": 0.1066192900761962, "learning_rate": 1.2183333333333332e-06, "loss": 0.0946, "num_tokens": 1872771.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 117.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.3626270294189453, "kl": 0.016668145544826984, "learning_rate": 1.218e-06, "loss": 0.1227, "num_tokens": 1873114.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 6347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 117.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.049479641020298004, "kl": 0.006176235852763057, "learning_rate": 1.2176666666666668e-06, "loss": 0.0003, "num_tokens": 1873396.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 117.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.15409399569034576, "kl": 0.011553944554179907, "learning_rate": 1.2173333333333334e-06, "loss": 0.0006, "num_tokens": 1873654.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 117.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.023245124146342278, "kl": 0.1609538048505783, "learning_rate": 1.217e-06, "loss": 0.008, "num_tokens": 1873963.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 117.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00604587746784091, "kl": 0.00033239772892557085, "learning_rate": 1.2166666666666667e-06, "loss": 0.0, "num_tokens": 1874235.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.688149929046631, "kl": 0.03636751603335142, "learning_rate": 1.2163333333333333e-06, "loss": 0.1454, "num_tokens": 1874512.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 117.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.012772396206855774, "kl": 0.09722325205802917, "learning_rate": 1.216e-06, "loss": 0.0049, "num_tokens": 1874884.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.061891283839941025, "kl": 0.004158852971158922, "learning_rate": 1.2156666666666667e-06, "loss": 0.0002, "num_tokens": 1875138.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 117.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05476026237010956, "kl": 0.03428677376359701, "learning_rate": 1.2153333333333332e-06, "loss": 0.0017, "num_tokens": 1875549.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 117.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.08282981067895889, "kl": 0.02204325655475259, "learning_rate": 1.215e-06, "loss": 0.0011, "num_tokens": 1875886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 117.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07960408926010132, "kl": 0.008824027609080076, "learning_rate": 1.2146666666666668e-06, "loss": 0.0004, "num_tokens": 1876216.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 117.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.04834507405757904, "kl": 0.001701838686130941, "learning_rate": 1.2143333333333334e-06, "loss": 0.0001, "num_tokens": 1876488.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02579350955784321, "kl": 0.00026485323905944824, "learning_rate": 1.2140000000000002e-06, "loss": 0.0, "num_tokens": 1876700.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 117.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.47691863775253296, "kl": 0.10419574286788702, "learning_rate": 1.2136666666666667e-06, "loss": 0.0046, "num_tokens": 1876991.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.019206425175070763, "kl": 0.0009514418197795749, "learning_rate": 1.2133333333333333e-06, "loss": 0.0, "num_tokens": 1877263.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 117.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05284281075000763, "kl": 0.009344940539449453, "learning_rate": 1.213e-06, "loss": 0.0005, "num_tokens": 1877527.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 117.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0187024287879467, "kl": 0.0004899409395875409, "learning_rate": 1.2126666666666666e-06, "loss": 0.0, "num_tokens": 1877760.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 117.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.7793655395507812, "kl": 0.044804759323596954, "learning_rate": 1.2123333333333332e-06, "loss": 0.1268, "num_tokens": 1878102.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 117.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.006062432657927275, "kl": 0.0003211127477698028, "learning_rate": 1.2120000000000002e-06, "loss": 0.0, "num_tokens": 1878416.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.012449974194169044, "kl": 0.00191789137898013, "learning_rate": 1.2116666666666668e-06, "loss": 0.0001, "num_tokens": 1878696.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 117.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02751999720931053, "kl": 0.009616275317966938, "learning_rate": 1.2113333333333334e-06, "loss": 0.0005, "num_tokens": 1879022.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 117.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04843913018703461, "kl": 0.002564593218266964, "learning_rate": 1.2110000000000001e-06, "loss": 0.0001, "num_tokens": 1879345.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 117.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06524127721786499, "kl": 0.030951189808547497, "learning_rate": 1.2106666666666667e-06, "loss": 0.0015, "num_tokens": 1879682.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.5153093338012695, "kl": 0.0432470440864563, "learning_rate": 1.2103333333333333e-06, "loss": -0.036, "num_tokens": 1879963.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 117.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01231038011610508, "kl": 0.0023086676374077797, "learning_rate": 1.21e-06, "loss": 0.0001, "num_tokens": 1880275.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 118.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.2309725284576416, "kl": 0.058851104229688644, "learning_rate": 1.2096666666666666e-06, "loss": -0.0089, "num_tokens": 1880639.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 118.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037684952840209007, "kl": 6.638467311859131e-05, "learning_rate": 1.2093333333333332e-06, "loss": 0.0, "num_tokens": 1880851.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04324539005756378, "kl": 0.00459635304287076, "learning_rate": 1.2090000000000002e-06, "loss": 0.0002, "num_tokens": 1881144.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.030519749969244003, "kl": 0.0010947728587780148, "learning_rate": 1.2086666666666668e-06, "loss": 0.0001, "num_tokens": 1881415.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 118.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.20741461217403412, "kl": 0.02500736666843295, "learning_rate": 1.2083333333333333e-06, "loss": 0.0012, "num_tokens": 1881711.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.2913591861724854, "kl": 0.06482912134379148, "learning_rate": 1.2080000000000001e-06, "loss": -0.2348, "num_tokens": 1882057.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 118.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.17156784236431122, "kl": 0.012978978455066681, "learning_rate": 1.2076666666666667e-06, "loss": 0.0007, "num_tokens": 1882323.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 118.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.009120596572756767, "kl": 0.0014048232696950436, "learning_rate": 1.2073333333333333e-06, "loss": 0.0001, "num_tokens": 1882619.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 118.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.18915152549743652, "kl": 0.05025875195860863, "learning_rate": 1.207e-06, "loss": 0.0025, "num_tokens": 1882923.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.01114656776189804, "kl": 0.0013795166742056608, "learning_rate": 1.2066666666666666e-06, "loss": 0.0001, "num_tokens": 1883183.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.04206422343850136, "kl": 0.0020734071731567383, "learning_rate": 1.2063333333333332e-06, "loss": 0.0001, "num_tokens": 1883447.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 118.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.05972485616803169, "kl": 0.006392789771780372, "learning_rate": 1.2060000000000002e-06, "loss": 0.0003, "num_tokens": 1883773.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 118.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05720462277531624, "kl": 0.009414592292159796, "learning_rate": 1.2056666666666668e-06, "loss": 0.0005, "num_tokens": 1884108.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 118.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.05203743651509285, "kl": 0.01133543811738491, "learning_rate": 1.2053333333333333e-06, "loss": 0.0006, "num_tokens": 1884372.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 118.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0206092968583107, "kl": 0.05186851881444454, "learning_rate": 1.2050000000000001e-06, "loss": 0.0026, "num_tokens": 1884704.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 118.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0783233642578125, "kl": 0.03990246541798115, "learning_rate": 1.2046666666666667e-06, "loss": 0.002, "num_tokens": 1885072.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.05945427715778351, "kl": 0.0060268850065767765, "learning_rate": 1.2043333333333333e-06, "loss": 0.0003, "num_tokens": 1885332.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 118.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03179972618818283, "kl": 0.006192940287292004, "learning_rate": 1.204e-06, "loss": 0.0003, "num_tokens": 1885600.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 118.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.13461224734783173, "kl": 0.02167446445673704, "learning_rate": 1.2036666666666666e-06, "loss": 0.0011, "num_tokens": 1885890.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 118.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.042726099491119385, "kl": 0.004009530181065202, "learning_rate": 1.2033333333333334e-06, "loss": 0.0002, "num_tokens": 1886218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 118.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1628943532705307, "kl": 0.02399781160056591, "learning_rate": 1.2030000000000002e-06, "loss": 0.0014, "num_tokens": 1886486.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.3520936965942383, "kl": 0.17479710280895233, "learning_rate": 1.2026666666666667e-06, "loss": -0.2146, "num_tokens": 1886842.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 118.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09283898025751114, "kl": 0.006904813519213349, "learning_rate": 1.2023333333333333e-06, "loss": 0.0004, "num_tokens": 1887082.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 118.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 5.517305374145508, "kl": 0.031096127349883318, "learning_rate": 1.202e-06, "loss": 0.0803, "num_tokens": 1887359.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 6395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 118.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.7017276287078857, "kl": 0.1395142897963524, "learning_rate": 1.2016666666666667e-06, "loss": -0.0128, "num_tokens": 1887719.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 118.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.019146645441651344, "kl": 0.0023343415232375264, "learning_rate": 1.2013333333333332e-06, "loss": 0.0001, "num_tokens": 1888033.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007597935618832707, "kl": 0.001225618296302855, "learning_rate": 1.201e-06, "loss": 0.0001, "num_tokens": 1888313.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 118.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05427346006035805, "kl": 0.2748561501502991, "learning_rate": 1.2006666666666666e-06, "loss": 0.0137, "num_tokens": 1888617.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.11704078316688538, "kl": 0.0389588437974453, "learning_rate": 1.2003333333333334e-06, "loss": 0.0019, "num_tokens": 1888890.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 118.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.054681889712810516, "kl": 0.001852313638664782, "learning_rate": 1.2000000000000002e-06, "loss": 0.0001, "num_tokens": 1889102.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 118.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06008169800043106, "kl": 0.01628345251083374, "learning_rate": 1.1996666666666667e-06, "loss": 0.0009, "num_tokens": 1889431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 118.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.1296497881412506, "kl": 0.027248432859778404, "learning_rate": 1.1993333333333333e-06, "loss": 0.0014, "num_tokens": 1889751.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 118.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01889699138700962, "kl": 0.0012212160945637152, "learning_rate": 1.199e-06, "loss": 0.0001, "num_tokens": 1890060.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.021963927894830704, "kl": 0.001077877648640424, "learning_rate": 1.1986666666666667e-06, "loss": 0.0001, "num_tokens": 1890340.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 118.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06439710408449173, "kl": 0.001757104037096724, "learning_rate": 1.1983333333333334e-06, "loss": 0.0001, "num_tokens": 1890562.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 118.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05360562726855278, "kl": 0.008765297010540962, "learning_rate": 1.198e-06, "loss": 0.0004, "num_tokens": 1890844.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 118.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.01481380220502615, "kl": 0.0019328041234984994, "learning_rate": 1.1976666666666666e-06, "loss": 0.0001, "num_tokens": 1891126.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.037906572222709656, "kl": 0.004823529860004783, "learning_rate": 1.1973333333333334e-06, "loss": 0.0002, "num_tokens": 1891419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 118.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 4.52617883682251, "kl": 0.010026630014181137, "learning_rate": 1.1970000000000001e-06, "loss": -0.0778, "num_tokens": 1891717.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 6410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 118.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.15768961608409882, "kl": 0.06763637065887451, "learning_rate": 1.1966666666666667e-06, "loss": 0.0034, "num_tokens": 1892120.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 118.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.014757541008293629, "kl": 0.09695859253406525, "learning_rate": 1.1963333333333333e-06, "loss": 0.0048, "num_tokens": 1892492.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.05760321766138077, "kl": 0.013830000767484307, "learning_rate": 1.196e-06, "loss": 0.0007, "num_tokens": 1892762.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 118.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0746724009513855, "kl": 0.01655101589858532, "learning_rate": 1.1956666666666666e-06, "loss": 0.0008, "num_tokens": 1893069.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 118.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007975440821610391, "kl": 0.0037554726004600525, "learning_rate": 1.1953333333333334e-06, "loss": 0.0002, "num_tokens": 1893305.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 118.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.025200095027685165, "kl": 0.0032391101121902466, "learning_rate": 1.195e-06, "loss": 0.0002, "num_tokens": 1893521.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05634867399930954, "kl": 0.0026866591069847345, "learning_rate": 1.1946666666666666e-06, "loss": 0.0001, "num_tokens": 1893840.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 118.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00022323521261569113, "kl": 8.203089237213135e-06, "learning_rate": 1.1943333333333333e-06, "loss": 0.0, "num_tokens": 1894060.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 118.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01999380998313427, "kl": 0.0007640570402145386, "learning_rate": 1.1940000000000001e-06, "loss": 0.0, "num_tokens": 1894272.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 118.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 10.764765739440918, "kl": 0.015726592391729355, "learning_rate": 1.1936666666666667e-06, "loss": 0.2206, "num_tokens": 1894528.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 6420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.031007084995508194, "kl": 0.00040774644367047586, "learning_rate": 1.1933333333333335e-06, "loss": 0.0, "num_tokens": 1894784.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 118.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.9712905883789062, "kl": 0.1457047387957573, "learning_rate": 1.193e-06, "loss": -0.0244, "num_tokens": 1895106.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 118.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.586273193359375, "kl": 0.036043503787368536, "learning_rate": 1.1926666666666666e-06, "loss": 0.1291, "num_tokens": 1895407.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 118.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.3200676739215851, "kl": 0.017675194423645735, "learning_rate": 1.1923333333333334e-06, "loss": 0.0009, "num_tokens": 1895728.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 118.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07494300603866577, "kl": 0.035678806249052286, "learning_rate": 1.192e-06, "loss": 0.0018, "num_tokens": 1896022.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 119.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.021483000367879868, "kl": 0.0006169751286506653, "learning_rate": 1.1916666666666666e-06, "loss": 0.0, "num_tokens": 1896282.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 119.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.293263912200928, "kl": 0.08885498903691769, "learning_rate": 1.1913333333333335e-06, "loss": 0.1419, "num_tokens": 1896637.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 119.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06423847377300262, "kl": 0.009796207305043936, "learning_rate": 1.1910000000000001e-06, "loss": 0.0005, "num_tokens": 1896938.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 119.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03536780923604965, "kl": 0.0020239034784026444, "learning_rate": 1.1906666666666667e-06, "loss": 0.0001, "num_tokens": 1897261.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 119.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.032690420746803284, "kl": 0.002398474025540054, "learning_rate": 1.1903333333333335e-06, "loss": 0.0001, "num_tokens": 1897570.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 119.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07405785471200943, "kl": 0.0016200989484786987, "learning_rate": 1.19e-06, "loss": 0.0001, "num_tokens": 1897778.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 119.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.832571029663086, "kl": 0.4032875234261155, "learning_rate": 1.1896666666666666e-06, "loss": 0.044, "num_tokens": 1898039.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 119.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.16345266997814178, "kl": 0.012418131460435688, "learning_rate": 1.1893333333333334e-06, "loss": 0.0008, "num_tokens": 1898278.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 119.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.026300711557269096, "kl": 0.002880724292481318, "learning_rate": 1.189e-06, "loss": 0.0002, "num_tokens": 1898548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 119.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.048063069581985474, "kl": 0.0028284870786592364, "learning_rate": 1.1886666666666665e-06, "loss": 0.0001, "num_tokens": 1898821.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 119.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.018144050613045692, "kl": 0.09649721160531044, "learning_rate": 1.1883333333333335e-06, "loss": 0.0048, "num_tokens": 1899193.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 119.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.10659333318471909, "kl": 0.04098423197865486, "learning_rate": 1.188e-06, "loss": 0.0021, "num_tokens": 1899504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 119.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05494823306798935, "kl": 0.03383501060307026, "learning_rate": 1.1876666666666667e-06, "loss": 0.0017, "num_tokens": 1899863.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 119.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.012870515696704388, "kl": 0.0003301863180240616, "learning_rate": 1.1873333333333335e-06, "loss": 0.0, "num_tokens": 1900106.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 119.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 1.515233039855957, "kl": 0.8560158014297485, "learning_rate": 1.187e-06, "loss": 0.0608, "num_tokens": 1900411.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 119.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02664221078157425, "kl": 0.007528051733970642, "learning_rate": 1.1866666666666666e-06, "loss": 0.0004, "num_tokens": 1900671.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 119.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.00022559388889931142, "kl": 9.864568710327148e-06, "learning_rate": 1.1863333333333334e-06, "loss": 0.0, "num_tokens": 1900891.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 119.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.017285572364926338, "kl": 0.0004815608263015747, "learning_rate": 1.186e-06, "loss": 0.0, "num_tokens": 1901103.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 119.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.845367670059204, "kl": 0.18102366849780083, "learning_rate": 1.1856666666666665e-06, "loss": 0.0167, "num_tokens": 1901450.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 6444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 119.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006941179162822664, "kl": 0.0012119284365326166, "learning_rate": 1.1853333333333335e-06, "loss": 0.0001, "num_tokens": 1901730.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 119.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.012484087608754635, "kl": 0.0045114741660654545, "learning_rate": 1.185e-06, "loss": 0.0002, "num_tokens": 1902024.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 119.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.7878062725067139, "kl": 0.10964963585138321, "learning_rate": 1.1846666666666667e-06, "loss": -0.08, "num_tokens": 1902396.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 6447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 119.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.014289182610809803, "kl": 0.0016539028147235513, "learning_rate": 1.1843333333333334e-06, "loss": 0.0001, "num_tokens": 1902680.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 119.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.006454470567405224, "kl": 7.849186658859253e-05, "learning_rate": 1.184e-06, "loss": 0.0, "num_tokens": 1902892.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 119.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07206703722476959, "kl": 0.014787786640226841, "learning_rate": 1.1836666666666666e-06, "loss": 0.0007, "num_tokens": 1903199.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 119.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1333763152360916, "kl": 0.03642389178276062, "learning_rate": 1.1833333333333334e-06, "loss": 0.0018, "num_tokens": 1903489.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 119.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.21446919441223145, "kl": 0.0358577836304903, "learning_rate": 1.183e-06, "loss": 0.0017, "num_tokens": 1903759.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 119.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0662379041314125, "kl": 0.014518793672323227, "learning_rate": 1.1826666666666667e-06, "loss": 0.0007, "num_tokens": 1904097.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 119.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.020288633182644844, "kl": 0.005442020716145635, "learning_rate": 1.1823333333333335e-06, "loss": 0.0003, "num_tokens": 1904365.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 119.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.059898924082517624, "kl": 0.00407529016956687, "learning_rate": 1.182e-06, "loss": 0.0002, "num_tokens": 1904661.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 119.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.045951537787914276, "kl": 0.012956413440406322, "learning_rate": 1.1816666666666666e-06, "loss": 0.0007, "num_tokens": 1904989.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 119.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014931109035387635, "kl": 0.0013553500175476074, "learning_rate": 1.1813333333333334e-06, "loss": 0.0001, "num_tokens": 1905205.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 119.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0833604484796524, "kl": 0.016794190276414156, "learning_rate": 1.181e-06, "loss": 0.0008, "num_tokens": 1905541.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 119.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.9779343605041504, "kl": 0.062278375029563904, "learning_rate": 1.1806666666666666e-06, "loss": -0.0754, "num_tokens": 1905901.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 6459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 119.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008301954949274659, "kl": 0.0037469416856765747, "learning_rate": 1.1803333333333334e-06, "loss": 0.0002, "num_tokens": 1906137.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 119.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.4293761253356934, "kl": 0.10022081807255745, "learning_rate": 1.18e-06, "loss": 0.1051, "num_tokens": 1906457.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 119.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.12528786063194275, "kl": 0.017758074216544628, "learning_rate": 1.1796666666666667e-06, "loss": 0.001, "num_tokens": 1906759.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 119.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03259257972240448, "kl": 0.0041419247863814235, "learning_rate": 1.1793333333333335e-06, "loss": 0.0002, "num_tokens": 1907090.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 119.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032860408537089825, "kl": 7.447600364685059e-05, "learning_rate": 1.179e-06, "loss": 0.0, "num_tokens": 1907350.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 119.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0377628318965435, "kl": 0.032129768282175064, "learning_rate": 1.1786666666666666e-06, "loss": 0.0016, "num_tokens": 1907760.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 119.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.022190527990460396, "kl": 0.0008364841341972351, "learning_rate": 1.1783333333333334e-06, "loss": 0.0, "num_tokens": 1908020.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 119.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02222239039838314, "kl": 0.0007166534633142874, "learning_rate": 1.178e-06, "loss": 0.0, "num_tokens": 1908239.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 119.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02523590251803398, "kl": 0.001289821113459766, "learning_rate": 1.1776666666666668e-06, "loss": 0.0001, "num_tokens": 1908505.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 119.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.2763098478317261, "kl": 0.023756375536322594, "learning_rate": 1.1773333333333333e-06, "loss": 0.0012, "num_tokens": 1908773.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 119.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03187099099159241, "kl": 0.0006134450304671191, "learning_rate": 1.177e-06, "loss": 0.0, "num_tokens": 1909029.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 119.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 4.4135613441467285, "kl": 0.18188875913619995, "learning_rate": 1.1766666666666667e-06, "loss": 0.2793, "num_tokens": 1909377.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 119.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.043207310140132904, "kl": 0.00531318667344749, "learning_rate": 1.1763333333333335e-06, "loss": 0.0003, "num_tokens": 1909673.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 119.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.019358597695827484, "kl": 0.00369433150626719, "learning_rate": 1.176e-06, "loss": 0.0002, "num_tokens": 1909973.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 119.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.052679259330034256, "kl": 0.006142878322862089, "learning_rate": 1.1756666666666666e-06, "loss": 0.0003, "num_tokens": 1910305.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 119.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05763982981443405, "kl": 0.008953645825386047, "learning_rate": 1.1753333333333334e-06, "loss": 0.0004, "num_tokens": 1910587.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 119.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.699066162109375, "kl": 0.014039483503438532, "learning_rate": 1.175e-06, "loss": 0.0686, "num_tokens": 1910918.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 119.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.009389546699821949, "kl": 0.001685982570052147, "learning_rate": 1.1746666666666668e-06, "loss": 0.0001, "num_tokens": 1911230.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 119.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.004241365008056164, "kl": 0.0001102412716136314, "learning_rate": 1.1743333333333333e-06, "loss": 0.0, "num_tokens": 1911502.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 119.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 1.8897757530212402, "kl": 0.3033977091545239, "learning_rate": 1.174e-06, "loss": 0.0152, "num_tokens": 1911762.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 120.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.009460690431296825, "kl": 0.00846131145954132, "learning_rate": 1.1736666666666667e-06, "loss": 0.0004, "num_tokens": 1912034.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 120.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077101378701627254, "kl": 0.00013370811939239502, "learning_rate": 1.1733333333333335e-06, "loss": 0.0, "num_tokens": 1912246.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6481 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 120.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.2743289470672607, "kl": 0.1450158953666687, "learning_rate": 1.173e-06, "loss": -0.0709, "num_tokens": 1912602.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 120.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06430388242006302, "kl": 0.027581464499235153, "learning_rate": 1.1726666666666668e-06, "loss": 0.0014, "num_tokens": 1912900.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 120.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03585256636142731, "kl": 0.010957730002701283, "learning_rate": 1.1723333333333334e-06, "loss": 0.0006, "num_tokens": 1913227.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.4235100746154785, "kl": 0.03472807363141328, "learning_rate": 1.172e-06, "loss": 0.133, "num_tokens": 1913531.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 120.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02272755838930607, "kl": 0.0005561560392379761, "learning_rate": 1.1716666666666667e-06, "loss": 0.0, "num_tokens": 1913737.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021256390027701855, "kl": 0.001589758088812232, "learning_rate": 1.1713333333333333e-06, "loss": 0.0001, "num_tokens": 1914011.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 120.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1417020708322525, "kl": 0.05191383324563503, "learning_rate": 1.1709999999999999e-06, "loss": 0.0025, "num_tokens": 1914344.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 120.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0343189463019371, "kl": 0.011440465692430735, "learning_rate": 1.1706666666666669e-06, "loss": 0.0006, "num_tokens": 1914638.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 120.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08473880589008331, "kl": 0.019098554272204638, "learning_rate": 1.1703333333333335e-06, "loss": 0.001, "num_tokens": 1914925.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 120.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.020691394805908203, "kl": 0.0007858893950469792, "learning_rate": 1.17e-06, "loss": 0.0, "num_tokens": 1915205.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 120.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04949735105037689, "kl": 0.0351734422147274, "learning_rate": 1.1696666666666668e-06, "loss": 0.0018, "num_tokens": 1915479.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 120.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.01649233140051365, "kl": 0.00044474005699157715, "learning_rate": 1.1693333333333334e-06, "loss": 0.0, "num_tokens": 1915691.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 120.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 5.171275615692139, "kl": 0.2138686180114746, "learning_rate": 1.169e-06, "loss": -0.0128, "num_tokens": 1916052.0, "reward": 5.625, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 2.462214469909668, "step": 6494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 120.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.692605972290039, "kl": 0.013836213911417872, "learning_rate": 1.1686666666666667e-06, "loss": 0.0013, "num_tokens": 1916312.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 6495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 120.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.031527888029813766, "kl": 0.0036845599533990026, "learning_rate": 1.1683333333333333e-06, "loss": 0.0002, "num_tokens": 1916602.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 120.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.08643209934234619, "kl": 0.04004262760281563, "learning_rate": 1.1679999999999999e-06, "loss": 0.0019, "num_tokens": 1916934.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 120.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 4.578334808349609, "kl": 0.05969178630039096, "learning_rate": 1.1676666666666669e-06, "loss": 0.0489, "num_tokens": 1917277.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 120.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01702847331762314, "kl": 0.09659398719668388, "learning_rate": 1.1673333333333334e-06, "loss": 0.0048, "num_tokens": 1917649.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 120.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06658611446619034, "kl": 0.01085315365344286, "learning_rate": 1.167e-06, "loss": 0.0005, "num_tokens": 1917975.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 120.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.7283474206924438, "kl": 0.05486566200852394, "learning_rate": 1.1666666666666668e-06, "loss": 0.1116, "num_tokens": 1918396.0, "reward": 2.174999952316284, "reward_std": 1.649999976158142, "rewards/reward_combined/mean": 2.174999952316284, "rewards/reward_combined/std": 1.649999976158142, "step": 6501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 120.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00709752831608057, "kl": 0.00205912534147501, "learning_rate": 1.1663333333333334e-06, "loss": 0.0001, "num_tokens": 1918708.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 120.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.026969702914357185, "kl": 0.049299852922558784, "learning_rate": 1.166e-06, "loss": 0.0025, "num_tokens": 1919041.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 120.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.033657122403383255, "kl": 0.001314066001214087, "learning_rate": 1.1656666666666667e-06, "loss": 0.0001, "num_tokens": 1919276.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 120.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.023198192939162254, "kl": 0.0028923161153215915, "learning_rate": 1.1653333333333333e-06, "loss": 0.0001, "num_tokens": 1919578.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 120.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.031800609081983566, "kl": 0.26318879425525665, "learning_rate": 1.1649999999999999e-06, "loss": 0.0132, "num_tokens": 1919882.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 120.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.23323768377304077, "kl": 0.05135101266205311, "learning_rate": 1.1646666666666669e-06, "loss": 0.0026, "num_tokens": 1920236.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 120.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008731288253329694, "kl": 3.1054019927978516e-05, "learning_rate": 1.1643333333333334e-06, "loss": 0.0, "num_tokens": 1920456.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 120.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05076427012681961, "kl": 0.002242106245830655, "learning_rate": 1.164e-06, "loss": 0.0001, "num_tokens": 1920675.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 120.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03270020708441734, "kl": 0.005240541649982333, "learning_rate": 1.1636666666666668e-06, "loss": 0.0003, "num_tokens": 1920967.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 120.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008113679359667003, "kl": 0.0037462636828422546, "learning_rate": 1.1633333333333333e-06, "loss": 0.0002, "num_tokens": 1921203.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 120.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016736130928620696, "kl": 0.00025835633277893066, "learning_rate": 1.163e-06, "loss": 0.0, "num_tokens": 1921463.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 120.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071660554967820644, "kl": 0.0015529319643974304, "learning_rate": 1.1626666666666667e-06, "loss": 0.0001, "num_tokens": 1921679.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 120.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06511753052473068, "kl": 0.010602842550724745, "learning_rate": 1.1623333333333333e-06, "loss": 0.0005, "num_tokens": 1921978.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 120.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.029630031436681747, "kl": 0.0008384265820495784, "learning_rate": 1.162e-06, "loss": 0.0, "num_tokens": 1922221.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 120.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.004593794234097004, "kl": 0.00016801655146991834, "learning_rate": 1.1616666666666668e-06, "loss": 0.0, "num_tokens": 1922481.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07340265065431595, "kl": 0.01543547073379159, "learning_rate": 1.1613333333333334e-06, "loss": 0.0008, "num_tokens": 1922753.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 120.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.005003159400075674, "kl": 0.0001290440595766995, "learning_rate": 1.161e-06, "loss": 0.0, "num_tokens": 1923009.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07951806485652924, "kl": 0.008560521760955453, "learning_rate": 1.1606666666666668e-06, "loss": 0.0004, "num_tokens": 1923327.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 120.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.018592054024338722, "kl": 0.012788759544491768, "learning_rate": 1.1603333333333333e-06, "loss": 0.0006, "num_tokens": 1923587.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02817356027662754, "kl": 0.0023380888160318136, "learning_rate": 1.16e-06, "loss": 0.0001, "num_tokens": 1923860.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 120.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.148828029632568, "kl": 0.2056941445916891, "learning_rate": 1.1596666666666667e-06, "loss": 0.0948, "num_tokens": 1924137.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 120.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.10228823870420456, "kl": 0.010372804943472147, "learning_rate": 1.1593333333333333e-06, "loss": 0.0005, "num_tokens": 1924407.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 120.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.2594919800758362, "kl": 0.03747236914932728, "learning_rate": 1.159e-06, "loss": 0.0022, "num_tokens": 1924732.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 120.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03636854514479637, "kl": 0.00693343306920724, "learning_rate": 1.1586666666666668e-06, "loss": 0.0004, "num_tokens": 1925000.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 120.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005781413055956364, "kl": 0.0004506640980252996, "learning_rate": 1.1583333333333334e-06, "loss": 0.0, "num_tokens": 1925312.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 120.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.041753239929676056, "kl": 0.15194011479616165, "learning_rate": 1.158e-06, "loss": 0.0075, "num_tokens": 1925631.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 120.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.4301632046699524, "kl": 0.05248394142836332, "learning_rate": 1.1576666666666667e-06, "loss": 0.003, "num_tokens": 1925913.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 120.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.045214906334877014, "kl": 0.020316094160079956, "learning_rate": 1.1573333333333333e-06, "loss": 0.001, "num_tokens": 1926213.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 120.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03149750456213951, "kl": 0.0044946682173758745, "learning_rate": 1.157e-06, "loss": 0.0002, "num_tokens": 1926481.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 120.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06995546817779541, "kl": 0.010226914193481207, "learning_rate": 1.1566666666666667e-06, "loss": 0.0005, "num_tokens": 1926769.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 120.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.14868246018886566, "kl": 0.019098061602562666, "learning_rate": 1.1563333333333332e-06, "loss": 0.001, "num_tokens": 1927105.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.009264439344406128, "kl": 0.0015327318105846643, "learning_rate": 1.156e-06, "loss": 0.0001, "num_tokens": 1927387.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 121.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03600781783461571, "kl": 0.00176073465263471, "learning_rate": 1.1556666666666668e-06, "loss": 0.0001, "num_tokens": 1927708.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 121.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03034849837422371, "kl": 0.0027270345017313957, "learning_rate": 1.1553333333333334e-06, "loss": 0.0001, "num_tokens": 1927980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 121.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.809662818908691, "kl": 0.01322916243225336, "learning_rate": 1.155e-06, "loss": 0.0459, "num_tokens": 1928308.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 121.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033074396196752787, "kl": 0.00017439574003219604, "learning_rate": 1.1546666666666667e-06, "loss": 0.0, "num_tokens": 1928552.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 121.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012906003976240754, "kl": 0.00033399835228919983, "learning_rate": 1.1543333333333333e-06, "loss": 0.0, "num_tokens": 1928812.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 121.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08773849159479141, "kl": 0.001112423837184906, "learning_rate": 1.154e-06, "loss": 0.0001, "num_tokens": 1929024.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 121.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.141241654753685, "kl": 0.04679136909544468, "learning_rate": 1.1536666666666667e-06, "loss": 0.0024, "num_tokens": 1929341.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 121.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007866517407819629, "kl": 0.003756478428840637, "learning_rate": 1.1533333333333332e-06, "loss": 0.0002, "num_tokens": 1929577.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 121.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04516836255788803, "kl": 0.012945299968123436, "learning_rate": 1.153e-06, "loss": 0.0006, "num_tokens": 1929838.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 121.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.11207715421915054, "kl": 0.058697886765003204, "learning_rate": 1.1526666666666668e-06, "loss": 0.0029, "num_tokens": 1930179.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 121.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.030223773792386055, "kl": 0.2634342759847641, "learning_rate": 1.1523333333333334e-06, "loss": 0.0132, "num_tokens": 1930483.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 121.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.00621010409668088, "kl": 0.00014281273070082534, "learning_rate": 1.1520000000000002e-06, "loss": 0.0, "num_tokens": 1930739.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 121.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003828973858617246, "kl": 0.001236374955624342, "learning_rate": 1.1516666666666667e-06, "loss": 0.0001, "num_tokens": 1931019.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 121.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03259090706706047, "kl": 0.00884171505458653, "learning_rate": 1.1513333333333333e-06, "loss": 0.0004, "num_tokens": 1931311.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 121.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.03392448276281357, "kl": 0.0034284861758351326, "learning_rate": 1.151e-06, "loss": 0.0002, "num_tokens": 1931641.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 121.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03775680437684059, "kl": 0.03889380767941475, "learning_rate": 1.1506666666666666e-06, "loss": 0.0019, "num_tokens": 1931917.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 121.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.12175801396369934, "kl": 0.010034102015197277, "learning_rate": 1.1503333333333332e-06, "loss": 0.0005, "num_tokens": 1932217.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 121.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02031847834587097, "kl": 0.006155602788567194, "learning_rate": 1.1500000000000002e-06, "loss": 0.0003, "num_tokens": 1932489.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 121.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.024707650765776634, "kl": 0.0016802847385406494, "learning_rate": 1.1496666666666668e-06, "loss": 0.0001, "num_tokens": 1932801.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 121.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.629100799560547, "kl": 0.19461789727210999, "learning_rate": 1.1493333333333334e-06, "loss": 0.0063, "num_tokens": 1933169.0, "reward": 6.625, "reward_std": 2.428133726119995, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.428133726119995, "step": 6553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 121.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10235032439231873, "kl": 0.02571127749979496, "learning_rate": 1.1490000000000001e-06, "loss": 0.0014, "num_tokens": 1933459.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 121.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.020946571603417397, "kl": 0.0007784941699355841, "learning_rate": 1.1486666666666667e-06, "loss": 0.0, "num_tokens": 1933693.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 121.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.010214767418801785, "kl": 0.0001605344732524827, "learning_rate": 1.1483333333333333e-06, "loss": 0.0, "num_tokens": 1933973.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 121.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568927109241486, "kl": 0.012763059698045254, "learning_rate": 1.148e-06, "loss": 0.0006, "num_tokens": 1934300.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 121.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.029653701931238174, "kl": 0.0013552189921028912, "learning_rate": 1.1476666666666666e-06, "loss": 0.0001, "num_tokens": 1934625.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 121.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.533745527267456, "kl": 0.026880485005676746, "learning_rate": 1.1473333333333332e-06, "loss": 0.1205, "num_tokens": 1934906.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 121.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.16515734791755676, "kl": 0.02492852951399982, "learning_rate": 1.1470000000000002e-06, "loss": 0.0011, "num_tokens": 1935246.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 121.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.10162846744060516, "kl": 0.010246307007037103, "learning_rate": 1.1466666666666668e-06, "loss": 0.0005, "num_tokens": 1935506.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 121.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.019135337322950363, "kl": 0.0056260202545672655, "learning_rate": 1.1463333333333333e-06, "loss": 0.0003, "num_tokens": 1935772.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 121.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009062511846423149, "kl": 0.008659596554934978, "learning_rate": 1.1460000000000001e-06, "loss": 0.0004, "num_tokens": 1936044.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 121.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.055110394954681396, "kl": 0.012972465250641108, "learning_rate": 1.1456666666666667e-06, "loss": 0.0007, "num_tokens": 1936372.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 121.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.004890459589660168, "kl": 0.00022017360606696457, "learning_rate": 1.1453333333333333e-06, "loss": 0.0, "num_tokens": 1936632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 121.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.027844659984111786, "kl": 0.004094152478501201, "learning_rate": 1.145e-06, "loss": 0.0002, "num_tokens": 1936922.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 121.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.12352059036493301, "kl": 0.007494664052501321, "learning_rate": 1.1446666666666666e-06, "loss": 0.0004, "num_tokens": 1937182.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 121.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.015663154423236847, "kl": 0.0023040270316414535, "learning_rate": 1.1443333333333332e-06, "loss": 0.0001, "num_tokens": 1937478.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 121.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011276880279183388, "kl": 0.15838884562253952, "learning_rate": 1.1440000000000002e-06, "loss": 0.0079, "num_tokens": 1937788.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 121.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.022900735959410667, "kl": 0.0010702908039093018, "learning_rate": 1.1436666666666668e-06, "loss": 0.0001, "num_tokens": 1938000.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 121.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.04618305340409279, "kl": 0.002308756113052368, "learning_rate": 1.1433333333333333e-06, "loss": 0.0001, "num_tokens": 1938204.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 121.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.12868893146514893, "kl": 0.032958365976810455, "learning_rate": 1.1430000000000001e-06, "loss": 0.0017, "num_tokens": 1938505.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 121.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051928190514445305, "kl": 0.00048614738625474274, "learning_rate": 1.1426666666666667e-06, "loss": 0.0, "num_tokens": 1938725.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 121.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.017940184101462364, "kl": 0.09643261134624481, "learning_rate": 1.1423333333333333e-06, "loss": 0.0048, "num_tokens": 1939097.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 121.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 1.6241710186004639, "kl": 0.07638740912079811, "learning_rate": 1.142e-06, "loss": 0.0086, "num_tokens": 1939464.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 121.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.14684060215950012, "kl": 0.014174801646731794, "learning_rate": 1.1416666666666666e-06, "loss": 0.0007, "num_tokens": 1939747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 121.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.07468777149915695, "kl": 0.019712856505066156, "learning_rate": 1.1413333333333334e-06, "loss": 0.001, "num_tokens": 1940048.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 121.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.10913591086864471, "kl": 0.03890548646450043, "learning_rate": 1.1410000000000002e-06, "loss": 0.0019, "num_tokens": 1940471.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 121.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.116952896118164, "kl": 0.08014311641454697, "learning_rate": 1.1406666666666667e-06, "loss": 0.1064, "num_tokens": 1940835.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 121.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06265262514352798, "kl": 0.010103950276970863, "learning_rate": 1.1403333333333333e-06, "loss": 0.0005, "num_tokens": 1941132.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 121.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.105821393430233, "kl": 0.04428875632584095, "learning_rate": 1.14e-06, "loss": 0.0022, "num_tokens": 1941484.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 121.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.788204669952393, "kl": 0.0600991346873343, "learning_rate": 1.1396666666666667e-06, "loss": 0.0432, "num_tokens": 1941785.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 121.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.018679805099964142, "kl": 0.005518015008419752, "learning_rate": 1.1393333333333332e-06, "loss": 0.0003, "num_tokens": 1942053.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 121.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007013689610175788, "kl": 2.3759901523590088e-05, "learning_rate": 1.139e-06, "loss": 0.0, "num_tokens": 1942273.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 121.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 6.533872604370117, "kl": 0.07464139349758625, "learning_rate": 1.1386666666666666e-06, "loss": -0.1319, "num_tokens": 1942593.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 121.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.038593973964452744, "kl": 0.0026437936176080257, "learning_rate": 1.1383333333333334e-06, "loss": 0.0001, "num_tokens": 1942899.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 121.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.13153290748596191, "kl": 0.008460809476673603, "learning_rate": 1.1380000000000002e-06, "loss": 0.0005, "num_tokens": 1943126.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 122.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.673837900161743, "kl": 0.02412175014615059, "learning_rate": 1.1376666666666667e-06, "loss": -0.0391, "num_tokens": 1943392.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 122.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.269593209028244, "kl": 0.022401707246899605, "learning_rate": 1.1373333333333333e-06, "loss": 0.0012, "num_tokens": 1943654.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 122.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007820813334546983, "kl": 0.003760233521461487, "learning_rate": 1.137e-06, "loss": 0.0002, "num_tokens": 1943890.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 122.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.017251137644052505, "kl": 0.0036438003880903125, "learning_rate": 1.1366666666666667e-06, "loss": 0.0002, "num_tokens": 1944158.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 122.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09193898737430573, "kl": 0.008601611480116844, "learning_rate": 1.1363333333333334e-06, "loss": 0.0004, "num_tokens": 1944451.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 122.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01775575429201126, "kl": 0.09643914923071861, "learning_rate": 1.136e-06, "loss": 0.0048, "num_tokens": 1944823.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 122.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004691155627369881, "kl": 0.0015607811510562897, "learning_rate": 1.1356666666666666e-06, "loss": 0.0001, "num_tokens": 1945135.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 122.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.6137292385101318, "kl": 0.1320759579539299, "learning_rate": 1.1353333333333334e-06, "loss": -0.0575, "num_tokens": 1945512.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 122.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.005975870881229639, "kl": 0.001468625690904446, "learning_rate": 1.1350000000000001e-06, "loss": 0.0001, "num_tokens": 1945731.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 122.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.037724416702985764, "kl": 0.0019186652498319745, "learning_rate": 1.1346666666666667e-06, "loss": 0.0001, "num_tokens": 1946058.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 122.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03191739693284035, "kl": 0.000301949679851532, "learning_rate": 1.1343333333333333e-06, "loss": 0.0, "num_tokens": 1946270.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 122.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.43631380796432495, "kl": 0.026331719011068344, "learning_rate": 1.134e-06, "loss": 0.0017, "num_tokens": 1946541.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 122.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.019121550023555756, "kl": 0.0008227803918998688, "learning_rate": 1.1336666666666666e-06, "loss": 0.0, "num_tokens": 1946776.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 122.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03276032581925392, "kl": 0.26299040019512177, "learning_rate": 1.1333333333333334e-06, "loss": 0.0131, "num_tokens": 1947080.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 122.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.026009004563093185, "kl": 0.0013636148360092193, "learning_rate": 1.133e-06, "loss": 0.0001, "num_tokens": 1947398.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 122.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.036825548857450485, "kl": 0.005071159917861223, "learning_rate": 1.1326666666666666e-06, "loss": 0.0003, "num_tokens": 1947686.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 122.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.36840197443962097, "kl": 0.05935653671622276, "learning_rate": 1.1323333333333333e-06, "loss": 0.003, "num_tokens": 1947962.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 122.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.017999956384301186, "kl": 0.012935727834701538, "learning_rate": 1.1320000000000001e-06, "loss": 0.0006, "num_tokens": 1948222.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 122.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.01400921493768692, "kl": 0.00029931643803138286, "learning_rate": 1.1316666666666667e-06, "loss": 0.0, "num_tokens": 1948465.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 122.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.001294066198170185, "kl": 4.2216479414491914e-05, "learning_rate": 1.1313333333333335e-06, "loss": 0.0, "num_tokens": 1948733.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 122.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07810121774673462, "kl": 0.020303184166550636, "learning_rate": 1.131e-06, "loss": 0.0011, "num_tokens": 1949017.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 122.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.1420530080795288, "kl": 0.03908430226147175, "learning_rate": 1.1306666666666666e-06, "loss": 0.002, "num_tokens": 1949322.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 122.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.29549914598464966, "kl": 0.027963336557149887, "learning_rate": 1.1303333333333334e-06, "loss": 0.0014, "num_tokens": 1949588.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 122.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02412734553217888, "kl": 0.001976420055143535, "learning_rate": 1.13e-06, "loss": 0.0001, "num_tokens": 1949868.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 122.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.000716892653144896, "kl": 2.3633241653442383e-05, "learning_rate": 1.1296666666666666e-06, "loss": 0.0, "num_tokens": 1950088.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 122.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.17835897207260132, "kl": 0.023855048464611173, "learning_rate": 1.1293333333333333e-06, "loss": 0.0013, "num_tokens": 1950428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 122.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06581725180149078, "kl": 0.011161921545863152, "learning_rate": 1.1290000000000001e-06, "loss": 0.0006, "num_tokens": 1950757.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 122.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.9422038793563843, "kl": 0.19157817773520947, "learning_rate": 1.1286666666666667e-06, "loss": 0.0095, "num_tokens": 1951112.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 122.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.47744083404541, "kl": 0.029025439638644457, "learning_rate": 1.1283333333333335e-06, "loss": 0.0026, "num_tokens": 1951441.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6616 }, { "clip_ratio/high_max": 0.009803921915590763, "clip_ratio/high_mean": 0.009803921915590763, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009803921915590763, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 122.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.328023910522461, "kl": 0.16562017053365707, "learning_rate": 1.128e-06, "loss": 0.1351, "num_tokens": 1951792.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 122.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.016530541703104973, "kl": 0.15855525434017181, "learning_rate": 1.1276666666666666e-06, "loss": 0.0079, "num_tokens": 1952102.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 122.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03119640238583088, "kl": 0.00044555962085723877, "learning_rate": 1.1273333333333334e-06, "loss": 0.0, "num_tokens": 1952310.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 122.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07498772442340851, "kl": 0.026659665629267693, "learning_rate": 1.127e-06, "loss": 0.0013, "num_tokens": 1952614.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 122.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.8686230182647705, "kl": 0.007708510383963585, "learning_rate": 1.1266666666666665e-06, "loss": 0.0326, "num_tokens": 1952955.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 6621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 122.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06849901378154755, "kl": 0.011428920784965158, "learning_rate": 1.1263333333333335e-06, "loss": 0.0005, "num_tokens": 1953233.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 122.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05124097689986229, "kl": 0.004757784656248987, "learning_rate": 1.126e-06, "loss": 0.0002, "num_tokens": 1953533.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 122.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 11.247563362121582, "kl": 0.015773415565490723, "learning_rate": 1.1256666666666667e-06, "loss": -0.0334, "num_tokens": 1953804.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 122.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.037757158279418945, "kl": 0.000987064908258617, "learning_rate": 1.1253333333333335e-06, "loss": 0.0, "num_tokens": 1954102.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 122.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.006865231785923243, "kl": 0.0014312155544757843, "learning_rate": 1.125e-06, "loss": 0.0001, "num_tokens": 1954379.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 122.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05403284728527069, "kl": 0.04023777320981026, "learning_rate": 1.1246666666666666e-06, "loss": 0.002, "num_tokens": 1954783.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 122.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.3230810165405273, "kl": 0.04933694563806057, "learning_rate": 1.1243333333333334e-06, "loss": 0.1275, "num_tokens": 1955164.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 122.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.016358578577637672, "kl": 0.00044018030166625977, "learning_rate": 1.124e-06, "loss": 0.0, "num_tokens": 1955376.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 122.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.040608134120702744, "kl": 0.013359877280890942, "learning_rate": 1.1236666666666665e-06, "loss": 0.0007, "num_tokens": 1955679.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 122.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0580645315349102, "kl": 0.005300495307892561, "learning_rate": 1.1233333333333335e-06, "loss": 0.0004, "num_tokens": 1955917.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 122.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.07330907881259918, "kl": 0.019006874412298203, "learning_rate": 1.123e-06, "loss": 0.001, "num_tokens": 1956261.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 122.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.07663213461637497, "kl": 0.020455674966797233, "learning_rate": 1.1226666666666667e-06, "loss": 0.0011, "num_tokens": 1956548.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 122.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0524519719183445, "kl": 0.0009693175088614225, "learning_rate": 1.1223333333333334e-06, "loss": 0.0, "num_tokens": 1956804.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 122.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.020544296130537987, "kl": 0.0055718638468533754, "learning_rate": 1.122e-06, "loss": 0.0003, "num_tokens": 1957072.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 122.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07634744793176651, "kl": 0.009925136109814048, "learning_rate": 1.1216666666666666e-06, "loss": 0.0005, "num_tokens": 1957372.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 122.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.004817161243408918, "kl": 0.00021977425058139488, "learning_rate": 1.1213333333333334e-06, "loss": 0.0, "num_tokens": 1957632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 122.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.011919735930860043, "kl": 0.0002291479249834083, "learning_rate": 1.121e-06, "loss": 0.0, "num_tokens": 1957912.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 122.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.20443905889987946, "kl": 0.07632733508944511, "learning_rate": 1.1206666666666667e-06, "loss": 0.004, "num_tokens": 1958272.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 122.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.025047261267900467, "kl": 0.0018797516240738332, "learning_rate": 1.1203333333333335e-06, "loss": 0.0001, "num_tokens": 1958532.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 122.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06778904050588608, "kl": 0.009510488947853446, "learning_rate": 1.12e-06, "loss": 0.0005, "num_tokens": 1958858.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 123.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.01747989654541, "kl": 0.01983852032572031, "learning_rate": 1.1196666666666666e-06, "loss": 0.1248, "num_tokens": 1959173.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 123.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05625295639038086, "kl": 0.05622301809489727, "learning_rate": 1.1193333333333334e-06, "loss": 0.0028, "num_tokens": 1959509.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 123.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.021355150267481804, "kl": 0.0058467877097427845, "learning_rate": 1.119e-06, "loss": 0.0003, "num_tokens": 1959777.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 123.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02639990858733654, "kl": 0.0004590049502439797, "learning_rate": 1.1186666666666666e-06, "loss": 0.0, "num_tokens": 1959990.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 123.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.14955127239227295, "kl": 0.03589828871190548, "learning_rate": 1.1183333333333334e-06, "loss": 0.0018, "num_tokens": 1960314.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 123.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007937393384054303, "kl": 0.0037530064582824707, "learning_rate": 1.118e-06, "loss": 0.0002, "num_tokens": 1960550.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07580048590898514, "kl": 0.01687697321176529, "learning_rate": 1.1176666666666667e-06, "loss": 0.0009, "num_tokens": 1960834.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 123.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016741666477173567, "kl": 0.00028426945209503174, "learning_rate": 1.1173333333333335e-06, "loss": 0.0, "num_tokens": 1961148.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 123.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.021388089284300804, "kl": 0.0009245864348486066, "learning_rate": 1.117e-06, "loss": 0.0, "num_tokens": 1961468.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 123.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.14308567345142365, "kl": 0.04327143356204033, "learning_rate": 1.1166666666666666e-06, "loss": 0.0021, "num_tokens": 1961821.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 123.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.09517300128936768, "kl": 0.011151891900226474, "learning_rate": 1.1163333333333334e-06, "loss": 0.0006, "num_tokens": 1962152.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.092129185795784, "kl": 0.01014585793018341, "learning_rate": 1.116e-06, "loss": 0.0005, "num_tokens": 1962434.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 123.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.0957694053649902, "kl": 0.047333190217614174, "learning_rate": 1.1156666666666668e-06, "loss": -0.0042, "num_tokens": 1962771.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 123.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.06209235265851021, "kl": 0.004939502105116844, "learning_rate": 1.1153333333333333e-06, "loss": 0.0003, "num_tokens": 1963033.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 123.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.030932197347283363, "kl": 0.0024331025779247284, "learning_rate": 1.115e-06, "loss": 0.0001, "num_tokens": 1963345.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 123.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.29408520460128784, "kl": 0.062339795753359795, "learning_rate": 1.1146666666666667e-06, "loss": 0.004, "num_tokens": 1963723.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 123.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.16003206372261047, "kl": 0.011077792500145733, "learning_rate": 1.1143333333333335e-06, "loss": 0.0006, "num_tokens": 1963997.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 123.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.027227364480495453, "kl": 0.047547515481710434, "learning_rate": 1.114e-06, "loss": 0.0024, "num_tokens": 1964401.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 123.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.06734632700681686, "kl": 0.002562224864959717, "learning_rate": 1.1136666666666666e-06, "loss": 0.0001, "num_tokens": 1964615.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 123.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.013867117464542389, "kl": 0.0001319587172474712, "learning_rate": 1.1133333333333334e-06, "loss": 0.0, "num_tokens": 1964871.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 123.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03693933039903641, "kl": 0.12788153439760208, "learning_rate": 1.113e-06, "loss": 0.0064, "num_tokens": 1965180.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 123.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.12861475348472595, "kl": 0.010458302684128284, "learning_rate": 1.1126666666666668e-06, "loss": 0.0005, "num_tokens": 1965446.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 123.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.110867977142334, "kl": 0.12117099389433861, "learning_rate": 1.1123333333333333e-06, "loss": 0.0026, "num_tokens": 1965815.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 123.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.3038673400878906, "kl": 0.3133438229560852, "learning_rate": 1.112e-06, "loss": 0.03, "num_tokens": 1966120.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 123.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.029912354424595833, "kl": 0.0029180452693253756, "learning_rate": 1.1116666666666667e-06, "loss": 0.0001, "num_tokens": 1966400.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 123.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00737079419195652, "kl": 9.790012700250372e-05, "learning_rate": 1.1113333333333335e-06, "loss": 0.0, "num_tokens": 1966670.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 123.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.013049686327576637, "kl": 0.0009212995209963992, "learning_rate": 1.111e-06, "loss": 0.0, "num_tokens": 1966966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 123.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.08290422707796097, "kl": 0.007014411268755794, "learning_rate": 1.1106666666666668e-06, "loss": 0.0004, "num_tokens": 1967256.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 123.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.3177599906921387, "kl": 0.07530639320611954, "learning_rate": 1.1103333333333334e-06, "loss": 0.0307, "num_tokens": 1967638.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 123.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.012634089216589928, "kl": 0.002322533298865892, "learning_rate": 1.11e-06, "loss": 0.0001, "num_tokens": 1967904.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 123.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.08547884225845337, "kl": 0.03303547203540802, "learning_rate": 1.1096666666666667e-06, "loss": 0.0016, "num_tokens": 1968222.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6672 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 123.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 6.624612331390381, "kl": 0.03873021062463522, "learning_rate": 1.1093333333333333e-06, "loss": -0.0982, "num_tokens": 1968537.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 123.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04499563202261925, "kl": 0.005090321647003293, "learning_rate": 1.1089999999999999e-06, "loss": 0.0003, "num_tokens": 1968869.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.12638144195079803, "kl": 0.035763099789619446, "learning_rate": 1.1086666666666667e-06, "loss": 0.0019, "num_tokens": 1969158.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 123.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042531960643827915, "kl": 0.0002468675374984741, "learning_rate": 1.1083333333333335e-06, "loss": 0.0, "num_tokens": 1969418.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0449579656124115, "kl": 0.0025136874173767865, "learning_rate": 1.108e-06, "loss": 0.0001, "num_tokens": 1969718.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 123.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.037679605185985565, "kl": 0.002508559846319258, "learning_rate": 1.1076666666666668e-06, "loss": 0.0001, "num_tokens": 1969978.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 123.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029916069470345974, "kl": 0.00013575702905654907, "learning_rate": 1.1073333333333334e-06, "loss": 0.0, "num_tokens": 1970222.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 123.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.02202909253537655, "kl": 0.0021343620028346777, "learning_rate": 1.107e-06, "loss": 0.0001, "num_tokens": 1970550.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 123.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.017070196568965912, "kl": 0.013190580997616053, "learning_rate": 1.1066666666666667e-06, "loss": 0.0007, "num_tokens": 1970810.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 123.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 4.008839130401611, "kl": 0.040121917612850666, "learning_rate": 1.1063333333333333e-06, "loss": 0.2553, "num_tokens": 1971154.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6682 }, { "clip_ratio/high_max": 0.006493506487458944, "clip_ratio/high_mean": 0.006493506487458944, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006493506487458944, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 123.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.4907705783843994, "kl": 0.07521631568670273, "learning_rate": 1.1059999999999999e-06, "loss": 0.0749, "num_tokens": 1971524.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 123.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01797998696565628, "kl": 0.0024140363093465567, "learning_rate": 1.1056666666666669e-06, "loss": 0.0001, "num_tokens": 1971806.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.226508140563965, "kl": 0.07636481896042824, "learning_rate": 1.1053333333333334e-06, "loss": 0.0038, "num_tokens": 1972109.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 123.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03057272918522358, "kl": 0.0019758939743041992, "learning_rate": 1.105e-06, "loss": 0.0001, "num_tokens": 1972381.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 123.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 12.118617057800293, "kl": 0.06876134339836426, "learning_rate": 1.1046666666666668e-06, "loss": -0.2826, "num_tokens": 1972612.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 6687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 123.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.537738800048828, "kl": 0.03233519662171602, "learning_rate": 1.1043333333333334e-06, "loss": 0.0307, "num_tokens": 1972886.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 123.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.013402629643678665, "kl": 0.00030046701431274414, "learning_rate": 1.104e-06, "loss": 0.0, "num_tokens": 1973098.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 123.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.018313413485884666, "kl": 0.09641879796981812, "learning_rate": 1.1036666666666667e-06, "loss": 0.0048, "num_tokens": 1973470.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 123.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.027211032807826996, "kl": 0.006547056371346116, "learning_rate": 1.1033333333333333e-06, "loss": 0.0003, "num_tokens": 1973760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 123.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076531157828867435, "kl": 0.0016411244869232178, "learning_rate": 1.1029999999999999e-06, "loss": 0.0001, "num_tokens": 1973976.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 123.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11016985774040222, "kl": 0.0048295200685970485, "learning_rate": 1.1026666666666669e-06, "loss": 0.0002, "num_tokens": 1974195.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 123.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009899057913571596, "kl": 3.697723150253296e-05, "learning_rate": 1.1023333333333334e-06, "loss": 0.0, "num_tokens": 1974415.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 123.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.3061322569847107, "kl": 0.05900104157626629, "learning_rate": 1.102e-06, "loss": 0.0029, "num_tokens": 1974708.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 124.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.009826003573834896, "kl": 0.0037278781237546355, "learning_rate": 1.1016666666666668e-06, "loss": 0.0002, "num_tokens": 1974968.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 124.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 8.27171516418457, "kl": 0.16502747312188148, "learning_rate": 1.1013333333333333e-06, "loss": 0.0402, "num_tokens": 1975297.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 124.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068075633607804775, "kl": 0.0001447594549972564, "learning_rate": 1.101e-06, "loss": 0.0, "num_tokens": 1975540.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 124.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.11371287703514099, "kl": 0.016994278877973557, "learning_rate": 1.1006666666666667e-06, "loss": 0.0008, "num_tokens": 1975833.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 124.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03819449245929718, "kl": 0.04529155418276787, "learning_rate": 1.1003333333333333e-06, "loss": 0.0023, "num_tokens": 1976237.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 124.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007268072804436088, "kl": 0.0037770047783851624, "learning_rate": 1.0999999999999998e-06, "loss": 0.0002, "num_tokens": 1976473.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 124.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.014118671417236, "kl": 0.02089185267686844, "learning_rate": 1.0996666666666668e-06, "loss": 0.1583, "num_tokens": 1976757.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 6702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 124.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.267642617225647, "kl": 0.024595767725259066, "learning_rate": 1.0993333333333334e-06, "loss": 0.0014, "num_tokens": 1977095.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 124.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.4866061210632324, "kl": 0.014712004223838449, "learning_rate": 1.099e-06, "loss": 0.028, "num_tokens": 1977413.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 124.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.09291088581085205, "kl": 0.05437632463872433, "learning_rate": 1.0986666666666668e-06, "loss": 0.0027, "num_tokens": 1977752.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 124.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.019979797303676605, "kl": 0.006237521922230371, "learning_rate": 1.0983333333333333e-06, "loss": 0.0003, "num_tokens": 1978024.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.01857931725680828, "kl": 0.0018653283914318308, "learning_rate": 1.098e-06, "loss": 0.0001, "num_tokens": 1978294.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 124.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.20679014921188354, "kl": 0.018989129282999784, "learning_rate": 1.0976666666666667e-06, "loss": 0.001, "num_tokens": 1978624.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 124.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.00860736332833767, "kl": 0.26743079721927643, "learning_rate": 1.0973333333333333e-06, "loss": 0.0134, "num_tokens": 1978928.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6709 }, { "clip_ratio/high_max": 0.017241379246115685, "clip_ratio/high_mean": 0.017241379246115685, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017241379246115685, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 124.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 3.3299930095672607, "kl": 0.037656157510355115, "learning_rate": 1.097e-06, "loss": -0.0178, "num_tokens": 1979229.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 6710 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.011904762126505375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011904762126505375, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 124.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.283064842224121, "kl": 0.055986179038882256, "learning_rate": 1.0966666666666668e-06, "loss": 0.0638, "num_tokens": 1979538.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 124.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.14586737751960754, "kl": 0.022765167523175478, "learning_rate": 1.0963333333333334e-06, "loss": 0.0012, "num_tokens": 1979859.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 124.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03695819154381752, "kl": 0.010335276369005442, "learning_rate": 1.096e-06, "loss": 0.0005, "num_tokens": 1980153.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 124.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.035331230610609055, "kl": 0.0004420280456542969, "learning_rate": 1.0956666666666668e-06, "loss": 0.0, "num_tokens": 1980409.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 124.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007162205874919891, "kl": 2.3193657398223877e-05, "learning_rate": 1.0953333333333333e-06, "loss": 0.0, "num_tokens": 1980629.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.3293073773384094, "kl": 0.04378224955871701, "learning_rate": 1.095e-06, "loss": 0.0023, "num_tokens": 1980909.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03382283076643944, "kl": 0.013266139198094606, "learning_rate": 1.0946666666666667e-06, "loss": 0.0007, "num_tokens": 1981183.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 124.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.6197506785392761, "kl": 0.0986871924251318, "learning_rate": 1.0943333333333332e-06, "loss": 0.005, "num_tokens": 1981552.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 124.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.4851492643356323, "kl": 0.42122837249189615, "learning_rate": 1.094e-06, "loss": 0.0094, "num_tokens": 1981881.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 124.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05468791350722313, "kl": 0.005747489631175995, "learning_rate": 1.0936666666666668e-06, "loss": 0.0003, "num_tokens": 1982097.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 124.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01509421318769455, "kl": 0.16073846071958542, "learning_rate": 1.0933333333333334e-06, "loss": 0.008, "num_tokens": 1982406.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 124.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0060296026058495045, "kl": 0.00043669344449881464, "learning_rate": 1.093e-06, "loss": 0.0, "num_tokens": 1982666.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 124.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.19843657314777374, "kl": 0.011688372935168445, "learning_rate": 1.0926666666666667e-06, "loss": 0.0006, "num_tokens": 1982899.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 124.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.12993422150611877, "kl": 0.0359414704144001, "learning_rate": 1.0923333333333333e-06, "loss": 0.0018, "num_tokens": 1983269.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.013794119469821453, "kl": 0.001781273982487619, "learning_rate": 1.092e-06, "loss": 0.0001, "num_tokens": 1983551.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 124.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.15650252997875214, "kl": 0.01949635287746787, "learning_rate": 1.0916666666666667e-06, "loss": 0.0012, "num_tokens": 1983819.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 124.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.02222519740462303, "kl": 0.0018115078564733267, "learning_rate": 1.0913333333333332e-06, "loss": 0.0001, "num_tokens": 1984097.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 124.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.030690694227814674, "kl": 0.002545369789004326, "learning_rate": 1.091e-06, "loss": 0.0001, "num_tokens": 1984379.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 124.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 6.276077747344971, "kl": 0.02794385515153408, "learning_rate": 1.0906666666666668e-06, "loss": 0.3721, "num_tokens": 1984650.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 6729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 124.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.01635040156543255, "kl": 0.0005342587828636169, "learning_rate": 1.0903333333333334e-06, "loss": 0.0, "num_tokens": 1984860.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 124.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04845886304974556, "kl": 0.011965001933276653, "learning_rate": 1.0900000000000002e-06, "loss": 0.0006, "num_tokens": 1985166.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 124.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03365279734134674, "kl": 0.0017847068229457363, "learning_rate": 1.0896666666666667e-06, "loss": 0.0001, "num_tokens": 1985472.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 124.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03229230269789696, "kl": 0.005544868065044284, "learning_rate": 1.0893333333333333e-06, "loss": 0.0003, "num_tokens": 1985763.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 124.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.1267152577638626, "kl": 0.009396258043125272, "learning_rate": 1.089e-06, "loss": 0.0005, "num_tokens": 1986090.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 124.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.09310396760702133, "kl": 0.034787725657224655, "learning_rate": 1.0886666666666666e-06, "loss": 0.0017, "num_tokens": 1986362.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 124.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02764524333178997, "kl": 0.0009856869583018124, "learning_rate": 1.0883333333333332e-06, "loss": 0.0001, "num_tokens": 1986578.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 124.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.1168983206152916, "kl": 0.018185105174779892, "learning_rate": 1.088e-06, "loss": 0.001, "num_tokens": 1986864.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 124.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.0319888591766357, "kl": 0.03988751722499728, "learning_rate": 1.0876666666666668e-06, "loss": 0.038, "num_tokens": 1987134.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 124.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.035117071121931076, "kl": 0.0029737174045294523, "learning_rate": 1.0873333333333334e-06, "loss": 0.0002, "num_tokens": 1987406.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 124.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.08694670349359512, "kl": 0.032046109437942505, "learning_rate": 1.0870000000000001e-06, "loss": 0.0015, "num_tokens": 1987761.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.07139554619789124, "kl": 0.0026694713160395622, "learning_rate": 1.0866666666666667e-06, "loss": 0.0001, "num_tokens": 1988057.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 124.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 0.3275354504585266, "kl": 0.29027828946709633, "learning_rate": 1.0863333333333333e-06, "loss": -0.0073, "num_tokens": 1988427.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 6742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 124.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.35377341508865356, "kl": 0.11048712581396103, "learning_rate": 1.086e-06, "loss": 0.0055, "num_tokens": 1988809.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 124.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03870866447687149, "kl": 0.003643101081252098, "learning_rate": 1.0856666666666666e-06, "loss": 0.0002, "num_tokens": 1989101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 124.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02676636353135109, "kl": 0.00027589499950408936, "learning_rate": 1.0853333333333332e-06, "loss": 0.0, "num_tokens": 1989313.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 124.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.00675484212115407, "kl": 0.0009605585946701467, "learning_rate": 1.0850000000000002e-06, "loss": 0.0, "num_tokens": 1989533.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 124.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.035259027034044266, "kl": 0.0031496757874265313, "learning_rate": 1.0846666666666668e-06, "loss": 0.0002, "num_tokens": 1989847.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 124.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.014846750535070896, "kl": 0.013692875858396292, "learning_rate": 1.0843333333333333e-06, "loss": 0.0007, "num_tokens": 1990107.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 124.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.411559104919434, "kl": 0.199774457141757, "learning_rate": 1.0840000000000001e-06, "loss": 0.0114, "num_tokens": 1990409.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 6749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 125.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.10730460286140442, "kl": 0.012488137930631638, "learning_rate": 1.0836666666666667e-06, "loss": 0.0006, "num_tokens": 1990669.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 125.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.871403217315674, "kl": 0.3770003356039524, "learning_rate": 1.0833333333333333e-06, "loss": -0.124, "num_tokens": 1990992.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 6751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 125.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.022102462127804756, "kl": 0.0010592732578516006, "learning_rate": 1.083e-06, "loss": 0.0001, "num_tokens": 1991304.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 125.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.19147978723049164, "kl": 0.01298466557636857, "learning_rate": 1.0826666666666666e-06, "loss": 0.0006, "num_tokens": 1991558.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 125.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05072171613574028, "kl": 0.01280858926475048, "learning_rate": 1.0823333333333332e-06, "loss": 0.0006, "num_tokens": 1991883.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 125.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.022947359830141068, "kl": 0.001926311815623194, "learning_rate": 1.0820000000000002e-06, "loss": 0.0001, "num_tokens": 1992143.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 125.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04130426421761513, "kl": 0.008857755921781063, "learning_rate": 1.0816666666666668e-06, "loss": 0.0004, "num_tokens": 1992409.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 125.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03974505886435509, "kl": 0.004520110785961151, "learning_rate": 1.0813333333333333e-06, "loss": 0.0002, "num_tokens": 1992625.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 125.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011156612075865269, "kl": 0.00018146783258998767, "learning_rate": 1.0810000000000001e-06, "loss": 0.0, "num_tokens": 1992895.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 125.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.41733357310295105, "kl": 0.12040858715772629, "learning_rate": 1.0806666666666667e-06, "loss": 0.0057, "num_tokens": 1993245.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 125.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08265991508960724, "kl": 0.012211547465994954, "learning_rate": 1.0803333333333333e-06, "loss": 0.0006, "num_tokens": 1993578.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 125.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 4.374009609222412, "kl": 0.06599153392016888, "learning_rate": 1.08e-06, "loss": -0.0183, "num_tokens": 1993936.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 125.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.020630493760108948, "kl": 0.006491988431662321, "learning_rate": 1.0796666666666666e-06, "loss": 0.0003, "num_tokens": 1994234.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 125.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.058466386049985886, "kl": 0.01436679670587182, "learning_rate": 1.0793333333333332e-06, "loss": 0.0008, "num_tokens": 1994508.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 125.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.022439507767558098, "kl": 0.03939470276236534, "learning_rate": 1.0790000000000002e-06, "loss": 0.002, "num_tokens": 1994913.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 125.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05073806270956993, "kl": 0.028877712786197662, "learning_rate": 1.0786666666666667e-06, "loss": 0.0014, "num_tokens": 1995268.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 125.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0413748063147068, "kl": 0.000565357506275177, "learning_rate": 1.0783333333333333e-06, "loss": 0.0, "num_tokens": 1995481.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 125.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03212542086839676, "kl": 0.0072274720296263695, "learning_rate": 1.078e-06, "loss": 0.0004, "num_tokens": 1995818.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 125.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.07369399815797806, "kl": 0.0012552738189697266, "learning_rate": 1.0776666666666667e-06, "loss": 0.0001, "num_tokens": 1996038.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 125.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.18399615585803986, "kl": 0.17982543259859085, "learning_rate": 1.0773333333333332e-06, "loss": 0.009, "num_tokens": 1996347.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 125.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.025085950270295143, "kl": 0.004822854418307543, "learning_rate": 1.077e-06, "loss": 0.0002, "num_tokens": 1996638.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 125.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.9007527828216553, "kl": 0.07894202030729502, "learning_rate": 1.0766666666666666e-06, "loss": 0.0383, "num_tokens": 1996924.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 125.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.08631592243909836, "kl": 0.03764430247247219, "learning_rate": 1.0763333333333334e-06, "loss": 0.0019, "num_tokens": 1997222.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 125.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.006870661396533251, "kl": 0.0009814202785491943, "learning_rate": 1.0760000000000002e-06, "loss": 0.0, "num_tokens": 1997442.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 125.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06275569647550583, "kl": 0.043732261285185814, "learning_rate": 1.0756666666666667e-06, "loss": 0.0022, "num_tokens": 1997746.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 125.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.041031163185834885, "kl": 0.0018593408167362213, "learning_rate": 1.0753333333333333e-06, "loss": 0.0001, "num_tokens": 1998006.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 125.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04532339423894882, "kl": 0.009565880056470633, "learning_rate": 1.075e-06, "loss": 0.0004, "num_tokens": 1998325.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 125.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.11307155340909958, "kl": 0.01451186928898096, "learning_rate": 1.0746666666666667e-06, "loss": 0.0007, "num_tokens": 1998631.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 125.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.016273561865091324, "kl": 0.00021418631877168082, "learning_rate": 1.0743333333333334e-06, "loss": 0.0, "num_tokens": 1998887.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 125.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008092403295449913, "kl": 2.7239322662353516e-05, "learning_rate": 1.074e-06, "loss": 0.0, "num_tokens": 1999107.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 125.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.03855562210083, "kl": 0.26775551214814186, "learning_rate": 1.0736666666666666e-06, "loss": -0.1546, "num_tokens": 1999451.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 6780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 125.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007580664823763072, "kl": 0.003774188458919525, "learning_rate": 1.0733333333333334e-06, "loss": 0.0002, "num_tokens": 1999687.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 125.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.4763793349266052, "kl": 0.058355243410915136, "learning_rate": 1.0730000000000001e-06, "loss": 0.0032, "num_tokens": 1999988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 125.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1053260862827301, "kl": 0.0136321063619107, "learning_rate": 1.0726666666666667e-06, "loss": 0.0007, "num_tokens": 2000261.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 125.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04106324538588524, "kl": 0.0023971308255568147, "learning_rate": 1.0723333333333333e-06, "loss": 0.0001, "num_tokens": 2000557.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 125.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.018377000465989113, "kl": 0.006600759224966168, "learning_rate": 1.072e-06, "loss": 0.0003, "num_tokens": 2000839.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 125.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.07627657055854797, "kl": 0.009154963190667331, "learning_rate": 1.0716666666666666e-06, "loss": 0.0004, "num_tokens": 2001157.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 125.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.007892654277384281, "kl": 0.2675549238920212, "learning_rate": 1.0713333333333334e-06, "loss": 0.0134, "num_tokens": 2001461.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 125.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027956257108598948, "kl": 0.00029599841218441725, "learning_rate": 1.071e-06, "loss": 0.0, "num_tokens": 2001723.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 125.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01695258729159832, "kl": 0.0005003288388252258, "learning_rate": 1.0706666666666666e-06, "loss": 0.0, "num_tokens": 2001933.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 125.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.05122031271457672, "kl": 0.01109203090891242, "learning_rate": 1.0703333333333333e-06, "loss": 0.0005, "num_tokens": 2002264.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 125.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.055741142481565475, "kl": 0.004283519694581628, "learning_rate": 1.0700000000000001e-06, "loss": 0.0002, "num_tokens": 2002555.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 125.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.691385269165039, "kl": 0.03416193334851414, "learning_rate": 1.0696666666666667e-06, "loss": 0.0071, "num_tokens": 2002804.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 125.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.047999780625104904, "kl": 0.005202792584896088, "learning_rate": 1.0693333333333335e-06, "loss": 0.0003, "num_tokens": 2003074.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 125.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 4.3485212326049805, "kl": 0.0012153839052189142, "learning_rate": 1.069e-06, "loss": -0.0263, "num_tokens": 2003388.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 125.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.16370108723640442, "kl": 0.0857635922729969, "learning_rate": 1.0686666666666666e-06, "loss": 0.0043, "num_tokens": 2003767.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 125.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07150498032569885, "kl": 0.0024751177115831524, "learning_rate": 1.0683333333333334e-06, "loss": 0.0001, "num_tokens": 2004041.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 125.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.3654537200927734, "kl": 0.02511216001585126, "learning_rate": 1.068e-06, "loss": 0.1187, "num_tokens": 2004374.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 125.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.08819403499364853, "kl": 0.011112306732684374, "learning_rate": 1.0676666666666666e-06, "loss": 0.0006, "num_tokens": 2004648.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 125.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.13529539108276367, "kl": 0.1144271045923233, "learning_rate": 1.0673333333333333e-06, "loss": 0.0057, "num_tokens": 2005020.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 125.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 9.918779373168945, "kl": 0.2785526819061488, "learning_rate": 1.0670000000000001e-06, "loss": -0.1962, "num_tokens": 2005251.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 6800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 125.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.016369061544537544, "kl": 0.01342594949528575, "learning_rate": 1.0666666666666667e-06, "loss": 0.0007, "num_tokens": 2005511.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 125.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08869292587041855, "kl": 0.028977181762456894, "learning_rate": 1.0663333333333335e-06, "loss": 0.0016, "num_tokens": 2005800.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 125.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.058695025742053986, "kl": 0.025796583853662014, "learning_rate": 1.066e-06, "loss": 0.0013, "num_tokens": 2006074.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 126.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.017242413014173508, "kl": 0.0017141809221357107, "learning_rate": 1.0656666666666666e-06, "loss": 0.0001, "num_tokens": 2006356.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 126.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.004666907712817192, "kl": 0.00030143558979034424, "learning_rate": 1.0653333333333334e-06, "loss": 0.0, "num_tokens": 2006668.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 126.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00892165768891573, "kl": 0.008973983582109213, "learning_rate": 1.065e-06, "loss": 0.0004, "num_tokens": 2006940.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01810777559876442, "kl": 0.001970936224097386, "learning_rate": 1.0646666666666665e-06, "loss": 0.0001, "num_tokens": 2007210.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 126.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.027897275984287262, "kl": 0.0008604814356658608, "learning_rate": 1.0643333333333335e-06, "loss": 0.0, "num_tokens": 2007532.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 126.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013669824693351984, "kl": 5.65648078918457e-05, "learning_rate": 1.064e-06, "loss": 0.0, "num_tokens": 2007752.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 126.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03997933119535446, "kl": 0.0038047805428504944, "learning_rate": 1.0636666666666667e-06, "loss": 0.0002, "num_tokens": 2007968.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007181710097938776, "kl": 0.0014414922916330397, "learning_rate": 1.0633333333333335e-06, "loss": 0.0001, "num_tokens": 2008245.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 126.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.8784403800964355, "kl": 0.16962391138076782, "learning_rate": 1.063e-06, "loss": 0.022, "num_tokens": 2008555.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 126.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.008212026208639145, "kl": 0.0004836122097913176, "learning_rate": 1.0626666666666666e-06, "loss": 0.0, "num_tokens": 2008771.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 126.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07865607738494873, "kl": 0.013101758435368538, "learning_rate": 1.0623333333333334e-06, "loss": 0.0007, "num_tokens": 2009083.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 126.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04477753862738609, "kl": 0.041311923414468765, "learning_rate": 1.062e-06, "loss": 0.0021, "num_tokens": 2009487.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 126.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02466147392988205, "kl": 0.0027008940232917666, "learning_rate": 1.0616666666666665e-06, "loss": 0.0001, "num_tokens": 2009775.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 126.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.09314874559640884, "kl": 0.025913180783391, "learning_rate": 1.0613333333333335e-06, "loss": 0.0013, "num_tokens": 2010089.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.06415952742099762, "kl": 0.010469916742295027, "learning_rate": 1.061e-06, "loss": 0.0005, "num_tokens": 2010371.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 126.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007136716158129275, "kl": 0.0037858039140701294, "learning_rate": 1.0606666666666667e-06, "loss": 0.0002, "num_tokens": 2010607.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 126.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.1873864233493805, "kl": 0.030180448666214943, "learning_rate": 1.0603333333333334e-06, "loss": 0.0015, "num_tokens": 2010876.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 126.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.10978024452924728, "kl": 0.020875709131360054, "learning_rate": 1.06e-06, "loss": 0.001, "num_tokens": 2011173.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 126.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.12644976377487183, "kl": 0.03895176900550723, "learning_rate": 1.0596666666666666e-06, "loss": 0.0019, "num_tokens": 2011498.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 126.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.5371469855308533, "kl": 0.062394075095653534, "learning_rate": 1.0593333333333334e-06, "loss": 0.0035, "num_tokens": 2011801.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 126.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.6179850101470947, "kl": 0.08570412918925285, "learning_rate": 1.059e-06, "loss": 0.0798, "num_tokens": 2012142.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 126.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.002032228047028184, "kl": 3.768404167203698e-05, "learning_rate": 1.0586666666666665e-06, "loss": 0.0, "num_tokens": 2012412.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 126.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.6470732092857361, "kl": 0.0667794025503099, "learning_rate": 1.0583333333333335e-06, "loss": 0.0031, "num_tokens": 2012647.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 126.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0456935316324234, "kl": 0.025839708745479584, "learning_rate": 1.058e-06, "loss": 0.0012, "num_tokens": 2012973.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 126.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03953197970986366, "kl": 0.006467314786277711, "learning_rate": 1.0576666666666666e-06, "loss": 0.0003, "num_tokens": 2013282.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 126.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0701349601149559, "kl": 0.033556840382516384, "learning_rate": 1.0573333333333334e-06, "loss": 0.0017, "num_tokens": 2013554.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 126.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.015966404229402542, "kl": 0.002121111494489014, "learning_rate": 1.057e-06, "loss": 0.0001, "num_tokens": 2013838.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 126.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.028778649866580963, "kl": 0.0002717167080845684, "learning_rate": 1.0566666666666666e-06, "loss": 0.0, "num_tokens": 2014051.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 126.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03152531385421753, "kl": 0.005723806796595454, "learning_rate": 1.0563333333333334e-06, "loss": 0.0003, "num_tokens": 2014378.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 126.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06886554509401321, "kl": 0.003276680188719183, "learning_rate": 1.056e-06, "loss": 0.0002, "num_tokens": 2014621.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 126.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.8294137120246887, "kl": 0.17252267152071, "learning_rate": 1.0556666666666667e-06, "loss": 0.0086, "num_tokens": 2014993.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 126.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.007763893809169531, "kl": 0.2675829231739044, "learning_rate": 1.0553333333333335e-06, "loss": 0.0134, "num_tokens": 2015297.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.027351368218660355, "kl": 0.01299174246378243, "learning_rate": 1.055e-06, "loss": 0.0007, "num_tokens": 2015571.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 126.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.024424538016319275, "kl": 0.005048321094363928, "learning_rate": 1.0546666666666666e-06, "loss": 0.0002, "num_tokens": 2015829.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6837 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.012820512987673283, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 126.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.530215263366699, "kl": 0.07901394739747047, "learning_rate": 1.0543333333333334e-06, "loss": 0.0404, "num_tokens": 2016155.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 126.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02266966551542282, "kl": 0.003983311471529305, "learning_rate": 1.054e-06, "loss": 0.0002, "num_tokens": 2016415.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 126.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.27441143989563, "kl": 0.1077931597828865, "learning_rate": 1.0536666666666668e-06, "loss": 0.0323, "num_tokens": 2016789.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.10084228962659836, "kl": 0.00857362465467304, "learning_rate": 1.0533333333333333e-06, "loss": 0.0004, "num_tokens": 2017110.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 126.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.15140412747859955, "kl": 0.025718985125422478, "learning_rate": 1.053e-06, "loss": 0.0012, "num_tokens": 2017448.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.013510082848370075, "kl": 0.0009592053538654, "learning_rate": 1.0526666666666667e-06, "loss": 0.0, "num_tokens": 2017744.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 126.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.14007802307605743, "kl": 0.04836008697748184, "learning_rate": 1.0523333333333335e-06, "loss": 0.0026, "num_tokens": 2018113.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 126.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 3.018435478210449, "kl": 0.05536913452669978, "learning_rate": 1.052e-06, "loss": 0.0598, "num_tokens": 2018442.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 6845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 126.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06786739081144333, "kl": 0.03475327789783478, "learning_rate": 1.0516666666666666e-06, "loss": 0.0017, "num_tokens": 2018781.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 126.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.08906006813049316, "kl": 0.007779575273161754, "learning_rate": 1.0513333333333334e-06, "loss": 0.0004, "num_tokens": 2019052.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 126.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.014465812593698502, "kl": 0.004050555871799588, "learning_rate": 1.051e-06, "loss": 0.0002, "num_tokens": 2019343.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 126.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.006947892718017101, "kl": 0.0010696232784539461, "learning_rate": 1.0506666666666668e-06, "loss": 0.0001, "num_tokens": 2019563.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 126.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02097518928349018, "kl": 0.011328531429171562, "learning_rate": 1.0503333333333333e-06, "loss": 0.0006, "num_tokens": 2019824.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 126.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07189391553401947, "kl": 0.04991878941655159, "learning_rate": 1.05e-06, "loss": 0.0025, "num_tokens": 2020195.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 126.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.209402561187744, "kl": 0.06212242541369051, "learning_rate": 1.0496666666666667e-06, "loss": 0.0056, "num_tokens": 2020455.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 6852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 126.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04531474411487579, "kl": 0.0009812377102207392, "learning_rate": 1.0493333333333335e-06, "loss": 0.0001, "num_tokens": 2020712.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 126.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.13391625881195068, "kl": 0.04770008102059364, "learning_rate": 1.049e-06, "loss": 0.0024, "num_tokens": 2021003.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 126.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.12648507952690125, "kl": 0.011427598306909204, "learning_rate": 1.0486666666666668e-06, "loss": 0.0006, "num_tokens": 2021281.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 126.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040601445361971855, "kl": 0.0003002174198627472, "learning_rate": 1.0483333333333334e-06, "loss": 0.0, "num_tokens": 2021541.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 126.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06571891903877258, "kl": 0.022734422236680984, "learning_rate": 1.048e-06, "loss": 0.0011, "num_tokens": 2021844.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 127.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.024485278874635696, "kl": 0.0006596893072128296, "learning_rate": 1.0476666666666667e-06, "loss": 0.0, "num_tokens": 2022052.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 127.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00030234840232878923, "kl": 9.119510195887415e-06, "learning_rate": 1.0473333333333333e-06, "loss": 0.0, "num_tokens": 2022324.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 127.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00649535097181797, "kl": 0.0005749553674831986, "learning_rate": 1.0469999999999999e-06, "loss": 0.0, "num_tokens": 2022584.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 127.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.8829007148742676, "kl": 0.2540580630302429, "learning_rate": 1.0466666666666667e-06, "loss": -0.0197, "num_tokens": 2022955.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 127.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.061333753168582916, "kl": 0.005895009380765259, "learning_rate": 1.0463333333333335e-06, "loss": 0.0003, "num_tokens": 2023259.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 127.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.6130591034889221, "kl": 0.06673680990934372, "learning_rate": 1.046e-06, "loss": 0.0036, "num_tokens": 2023543.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 127.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.013818405568599701, "kl": 0.00047837250167503953, "learning_rate": 1.0456666666666668e-06, "loss": 0.0, "num_tokens": 2023786.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 127.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.016355376690626144, "kl": 0.0022178636281751096, "learning_rate": 1.0453333333333334e-06, "loss": 0.0001, "num_tokens": 2024070.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 127.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.7307298183441162, "kl": 0.09066342934966087, "learning_rate": 1.045e-06, "loss": 0.0059, "num_tokens": 2024430.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 127.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0052855354733765125, "kl": 6.3976644014474e-05, "learning_rate": 1.0446666666666667e-06, "loss": 0.0, "num_tokens": 2024686.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 127.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.014830618165433407, "kl": 0.0012570849794428796, "learning_rate": 1.0443333333333333e-06, "loss": 0.0001, "num_tokens": 2024956.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 127.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04651113599538803, "kl": 0.006027818424627185, "learning_rate": 1.0439999999999999e-06, "loss": 0.0003, "num_tokens": 2025285.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 127.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.11370302736759186, "kl": 0.04116308130323887, "learning_rate": 1.0436666666666669e-06, "loss": 0.0021, "num_tokens": 2025654.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 127.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 3.6144707202911377, "kl": 0.018034445587545633, "learning_rate": 1.0433333333333334e-06, "loss": 0.1428, "num_tokens": 2025942.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 127.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.9137215614318848, "kl": 0.02229921519756317, "learning_rate": 1.043e-06, "loss": -0.0299, "num_tokens": 2026232.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 127.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.033870477229356766, "kl": 0.002436707552988082, "learning_rate": 1.0426666666666668e-06, "loss": 0.0001, "num_tokens": 2026504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 127.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.02643694542348385, "kl": 0.0031824863981455564, "learning_rate": 1.0423333333333334e-06, "loss": 0.0002, "num_tokens": 2026794.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 127.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.018859701231122017, "kl": 0.00046088120143394917, "learning_rate": 1.042e-06, "loss": 0.0, "num_tokens": 2027072.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 127.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03923534229397774, "kl": 0.0037227485445328057, "learning_rate": 1.0416666666666667e-06, "loss": 0.0002, "num_tokens": 2027402.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 127.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02452106401324272, "kl": 0.04445505887269974, "learning_rate": 1.0413333333333333e-06, "loss": 0.0022, "num_tokens": 2027806.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6877 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.012820512987673283, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 127.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.3960442543029785, "kl": 0.13830716768279672, "learning_rate": 1.0409999999999999e-06, "loss": 0.0445, "num_tokens": 2028134.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 127.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.2857022285461426, "kl": 0.11537757702171803, "learning_rate": 1.0406666666666669e-06, "loss": -0.0416, "num_tokens": 2028443.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 127.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.4905614852905273, "kl": 0.07780079916119576, "learning_rate": 1.0403333333333334e-06, "loss": 0.158, "num_tokens": 2028795.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 127.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.09077906608581543, "kl": 0.014146331697702408, "learning_rate": 1.04e-06, "loss": 0.0007, "num_tokens": 2029096.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 127.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.01411760225892067, "kl": 0.0014931396581232548, "learning_rate": 1.0396666666666668e-06, "loss": 0.0001, "num_tokens": 2029392.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 127.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01290932483971119, "kl": 0.0009142353956121951, "learning_rate": 1.0393333333333333e-06, "loss": 0.0, "num_tokens": 2029611.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 127.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.9439899921417236, "kl": 0.05025894194841385, "learning_rate": 1.039e-06, "loss": 0.1725, "num_tokens": 2029986.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 6884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 127.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.09740295261144638, "kl": 0.018897773697972298, "learning_rate": 1.0386666666666667e-06, "loss": 0.0009, "num_tokens": 2030306.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 127.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.022704126313328743, "kl": 0.0007961168885231018, "learning_rate": 1.0383333333333333e-06, "loss": 0.0, "num_tokens": 2030566.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 127.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.004377822857350111, "kl": 0.001944471150636673, "learning_rate": 1.0379999999999998e-06, "loss": 0.0001, "num_tokens": 2030878.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 127.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.025142759084701538, "kl": 0.0013110190629959106, "learning_rate": 1.0376666666666668e-06, "loss": 0.0001, "num_tokens": 2031090.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 127.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.031239144504070282, "kl": 0.01201264327391982, "learning_rate": 1.0373333333333334e-06, "loss": 0.0006, "num_tokens": 2031351.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 127.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.44546443223953247, "kl": 0.047297073528170586, "learning_rate": 1.037e-06, "loss": 0.0024, "num_tokens": 2031615.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 127.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03662622347474098, "kl": 0.0003695487976074219, "learning_rate": 1.0366666666666668e-06, "loss": 0.0, "num_tokens": 2031819.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 127.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03863541781902313, "kl": 0.025793186388909817, "learning_rate": 1.0363333333333333e-06, "loss": 0.0014, "num_tokens": 2032108.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 127.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05816757306456566, "kl": 0.015358704142272472, "learning_rate": 1.036e-06, "loss": 0.0008, "num_tokens": 2032438.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 127.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.662898063659668, "kl": 0.30820655077695847, "learning_rate": 1.0356666666666667e-06, "loss": 0.0362, "num_tokens": 2032787.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 127.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.022715888917446136, "kl": 0.005837082164362073, "learning_rate": 1.0353333333333333e-06, "loss": 0.0003, "num_tokens": 2033045.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 127.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.9034695625305176, "kl": 0.17113797832280397, "learning_rate": 1.035e-06, "loss": 0.0946, "num_tokens": 2033353.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 6896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 127.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04517418146133423, "kl": 0.264210045337677, "learning_rate": 1.0346666666666668e-06, "loss": 0.0132, "num_tokens": 2033658.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 127.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.04011456295847893, "kl": 0.0033448264002799988, "learning_rate": 1.0343333333333334e-06, "loss": 0.0002, "num_tokens": 2033874.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 127.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.24899545311927795, "kl": 0.027900994289666414, "learning_rate": 1.034e-06, "loss": 0.0014, "num_tokens": 2034150.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 127.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.014461315236985683, "kl": 0.09716923534870148, "learning_rate": 1.0336666666666668e-06, "loss": 0.0049, "num_tokens": 2034522.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 127.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.020893394947052002, "kl": 0.0010304323222953826, "learning_rate": 1.0333333333333333e-06, "loss": 0.0001, "num_tokens": 2034848.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 127.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.04372726380825043, "kl": 0.007625943282619119, "learning_rate": 1.033e-06, "loss": 0.0004, "num_tokens": 2035130.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 127.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.06292850524187088, "kl": 0.03100801259279251, "learning_rate": 1.0326666666666667e-06, "loss": 0.0016, "num_tokens": 2035447.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 127.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.010208632797002792, "kl": 0.0005754321900894865, "learning_rate": 1.0323333333333332e-06, "loss": 0.0, "num_tokens": 2035758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 127.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.006103401072323322, "kl": 9.500980377197266e-05, "learning_rate": 1.032e-06, "loss": 0.0, "num_tokens": 2035970.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 127.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.26111310720443726, "kl": 0.04130120389163494, "learning_rate": 1.0316666666666668e-06, "loss": 0.002, "num_tokens": 2036280.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 127.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.19271153211593628, "kl": 0.011852469586301595, "learning_rate": 1.0313333333333334e-06, "loss": 0.0006, "num_tokens": 2036514.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 127.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 3.519508027238771e-05, "kl": 2.1904706954956055e-06, "learning_rate": 1.031e-06, "loss": 0.0, "num_tokens": 2036734.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 127.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.033342402428388596, "kl": 0.0024372367188334465, "learning_rate": 1.0306666666666667e-06, "loss": 0.0001, "num_tokens": 2036994.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 127.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.015936752781271935, "kl": 0.1608145907521248, "learning_rate": 1.0303333333333333e-06, "loss": 0.008, "num_tokens": 2037303.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 127.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007167708827182651, "kl": 0.0037852823734283447, "learning_rate": 1.03e-06, "loss": 0.0002, "num_tokens": 2037539.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.045564018189907074, "kl": 0.013573323376476765, "learning_rate": 1.0296666666666667e-06, "loss": 0.0008, "num_tokens": 2037813.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 128.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.029641034081578255, "kl": 0.002852322009857744, "learning_rate": 1.0293333333333332e-06, "loss": 0.0001, "num_tokens": 2038140.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 128.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.010478866286575794, "kl": 0.0006081965693738312, "learning_rate": 1.029e-06, "loss": 0.0, "num_tokens": 2038463.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 128.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 9.3764009475708, "kl": 0.005206409317906946, "learning_rate": 1.0286666666666668e-06, "loss": -0.1718, "num_tokens": 2038702.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 6915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.019619235768914223, "kl": 0.0012550760293379426, "learning_rate": 1.0283333333333334e-06, "loss": 0.0001, "num_tokens": 2038974.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 128.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03320958837866783, "kl": 0.0025151907466351986, "learning_rate": 1.0280000000000002e-06, "loss": 0.0001, "num_tokens": 2039234.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 128.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07638654112815857, "kl": 0.0627257376909256, "learning_rate": 1.0276666666666667e-06, "loss": 0.0031, "num_tokens": 2039609.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 128.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03185605630278587, "kl": 0.004255507723428309, "learning_rate": 1.0273333333333333e-06, "loss": 0.0002, "num_tokens": 2039897.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 128.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 6.953604497539345e-06, "kl": 2.332031726837158e-06, "learning_rate": 1.027e-06, "loss": 0.0, "num_tokens": 2040117.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 128.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.08747190237045288, "kl": 0.007621487835422158, "learning_rate": 1.0266666666666666e-06, "loss": 0.0004, "num_tokens": 2040446.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 128.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07447715848684311, "kl": 0.010064984206110239, "learning_rate": 1.0263333333333332e-06, "loss": 0.0005, "num_tokens": 2040738.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6922 }, { "clip_ratio/high_max": 0.008196720853447914, "clip_ratio/high_mean": 0.008196720853447914, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008196720853447914, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 128.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.604742050170898, "kl": 0.05840044282376766, "learning_rate": 1.026e-06, "loss": 0.2223, "num_tokens": 2041063.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 6923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 128.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.036337144672870636, "kl": 0.015849258517846465, "learning_rate": 1.0256666666666668e-06, "loss": 0.0009, "num_tokens": 2041357.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 128.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.008893858641386032, "kl": 0.2673604488372803, "learning_rate": 1.0253333333333334e-06, "loss": 0.0134, "num_tokens": 2041661.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 128.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.039111483842134476, "kl": 0.008277757093310356, "learning_rate": 1.0250000000000001e-06, "loss": 0.0004, "num_tokens": 2041941.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.014161958359181881, "kl": 0.00192840991076082, "learning_rate": 1.0246666666666667e-06, "loss": 0.0001, "num_tokens": 2042223.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03009587526321411, "kl": 0.005230151815339923, "learning_rate": 1.0243333333333333e-06, "loss": 0.0003, "num_tokens": 2042491.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 128.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.032548196613788605, "kl": 0.010243687313050032, "learning_rate": 1.024e-06, "loss": 0.0005, "num_tokens": 2042818.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 128.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.12394523620605469, "kl": 0.005353704560548067, "learning_rate": 1.0236666666666666e-06, "loss": 0.0003, "num_tokens": 2043040.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 128.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.011096575297415257, "kl": 0.0028420157614164054, "learning_rate": 1.0233333333333332e-06, "loss": 0.0001, "num_tokens": 2043306.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 128.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.05578472465276718, "kl": 0.028132045175880194, "learning_rate": 1.0230000000000002e-06, "loss": 0.0014, "num_tokens": 2043582.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 128.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05788668617606163, "kl": 0.052555881440639496, "learning_rate": 1.0226666666666668e-06, "loss": 0.0026, "num_tokens": 2043922.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04197635501623154, "kl": 0.002831184887327254, "learning_rate": 1.0223333333333333e-06, "loss": 0.0001, "num_tokens": 2044220.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6934 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.009615384973585606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 128.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.555518388748169, "kl": 0.6037652678787708, "learning_rate": 1.0220000000000001e-06, "loss": -0.0275, "num_tokens": 2044554.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 128.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.01702209748327732, "kl": 0.0002147436262021074, "learning_rate": 1.0216666666666667e-06, "loss": 0.0, "num_tokens": 2044810.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 128.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007536497432738543, "kl": 0.00377655029296875, "learning_rate": 1.0213333333333333e-06, "loss": 0.0002, "num_tokens": 2045046.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 128.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005898133385926485, "kl": 0.00126456783618778, "learning_rate": 1.021e-06, "loss": 0.0001, "num_tokens": 2045326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 128.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05638109892606735, "kl": 0.008408102672547102, "learning_rate": 1.0206666666666666e-06, "loss": 0.0004, "num_tokens": 2045625.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 128.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.4065365195274353, "kl": 0.10554420202970505, "learning_rate": 1.0203333333333332e-06, "loss": 0.0058, "num_tokens": 2046011.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 128.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 9.84788990020752, "kl": 0.02879244275391102, "learning_rate": 1.0200000000000002e-06, "loss": 0.1904, "num_tokens": 2046232.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 128.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033319215290248394, "kl": 0.0002970360219478607, "learning_rate": 1.0196666666666668e-06, "loss": 0.0, "num_tokens": 2046492.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 128.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.013312243856489658, "kl": 0.09712640941143036, "learning_rate": 1.0193333333333333e-06, "loss": 0.0049, "num_tokens": 2046864.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 128.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.21970827877521515, "kl": 0.013636435483931564, "learning_rate": 1.0190000000000001e-06, "loss": 0.0007, "num_tokens": 2047176.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 128.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.3400464057922363, "kl": 0.12038914859294891, "learning_rate": 1.0186666666666667e-06, "loss": -0.0152, "num_tokens": 2047505.0, "reward": 2.0, "reward_std": 2.4494898319244385, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 2.4494898319244385, "step": 6945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 128.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.01786050945520401, "kl": 0.0004708681663032621, "learning_rate": 1.0183333333333333e-06, "loss": 0.0, "num_tokens": 2047785.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 128.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03709577023983002, "kl": 0.02674313634634018, "learning_rate": 1.018e-06, "loss": 0.0013, "num_tokens": 2048192.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 128.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.010438960045576096, "kl": 0.008320785127580166, "learning_rate": 1.0176666666666666e-06, "loss": 0.0004, "num_tokens": 2048464.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 128.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.782740592956543, "kl": 0.052046431228518486, "learning_rate": 1.0173333333333332e-06, "loss": 0.3258, "num_tokens": 2048790.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 128.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.012623009271919727, "kl": 0.1605926901102066, "learning_rate": 1.0170000000000002e-06, "loss": 0.008, "num_tokens": 2049099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 128.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.029894687235355377, "kl": 0.006722460733726621, "learning_rate": 1.0166666666666667e-06, "loss": 0.0003, "num_tokens": 2049430.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 128.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.08028239011764526, "kl": 0.02459784783422947, "learning_rate": 1.0163333333333333e-06, "loss": 0.0013, "num_tokens": 2049718.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 128.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.01449799258261919, "kl": 0.0005191815289435908, "learning_rate": 1.016e-06, "loss": 0.0, "num_tokens": 2049980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 128.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02134798839688301, "kl": 0.0008262942137662321, "learning_rate": 1.0156666666666667e-06, "loss": 0.0, "num_tokens": 2050196.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 128.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007016902789473534, "kl": 7.8251462582557e-05, "learning_rate": 1.0153333333333332e-06, "loss": 0.0, "num_tokens": 2050462.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 128.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.2811586558818817, "kl": 0.10004860907793045, "learning_rate": 1.015e-06, "loss": 0.005, "num_tokens": 2050831.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 128.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06033000349998474, "kl": 0.0011277824669377878, "learning_rate": 1.0146666666666666e-06, "loss": 0.0001, "num_tokens": 2051044.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 128.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.029922988265752792, "kl": 0.0030030515044927597, "learning_rate": 1.0143333333333334e-06, "loss": 0.0002, "num_tokens": 2051358.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 128.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.00816932413727045, "kl": 0.0001157522201538086, "learning_rate": 1.0140000000000002e-06, "loss": 0.0, "num_tokens": 2051566.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 128.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.042841535061597824, "kl": 0.0063300770707428455, "learning_rate": 1.0136666666666667e-06, "loss": 0.0003, "num_tokens": 2051834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 128.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.13512448966503143, "kl": 0.007924702949821949, "learning_rate": 1.0133333333333333e-06, "loss": 0.0004, "num_tokens": 2052134.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 128.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.09692475944757462, "kl": 0.03999147564172745, "learning_rate": 1.013e-06, "loss": 0.002, "num_tokens": 2052471.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 128.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.01623937301337719, "kl": 0.013393386267125607, "learning_rate": 1.0126666666666667e-06, "loss": 0.0007, "num_tokens": 2052731.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 128.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 6.519444465637207, "kl": 0.0775340348482132, "learning_rate": 1.0123333333333334e-06, "loss": 0.0781, "num_tokens": 2053025.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 128.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0452214814722538, "kl": 0.0024351441534236073, "learning_rate": 1.012e-06, "loss": 0.0001, "num_tokens": 2053259.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 129.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.1895883083343506, "kl": 0.12014993000775576, "learning_rate": 1.0116666666666666e-06, "loss": 0.0594, "num_tokens": 2053549.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 129.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.006553079001605511, "kl": 0.0038548040320165455, "learning_rate": 1.0113333333333334e-06, "loss": 0.0002, "num_tokens": 2053807.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 129.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.20902186632156372, "kl": 0.02006150223314762, "learning_rate": 1.0110000000000001e-06, "loss": 0.001, "num_tokens": 2054073.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 129.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03659868985414505, "kl": 0.00426034489646554, "learning_rate": 1.0106666666666667e-06, "loss": 0.0002, "num_tokens": 2054405.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 129.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.235028028488159, "kl": 0.04834963008761406, "learning_rate": 1.0103333333333333e-06, "loss": -0.0299, "num_tokens": 2054696.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 129.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.01068859826773405, "kl": 0.001036324305459857, "learning_rate": 1.01e-06, "loss": 0.0001, "num_tokens": 2054964.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 129.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03159612789750099, "kl": 0.0015633500879630446, "learning_rate": 1.0096666666666666e-06, "loss": 0.0001, "num_tokens": 2055236.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 129.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 1.5784212350845337, "kl": 0.114490807056427, "learning_rate": 1.0093333333333334e-06, "loss": 0.0171, "num_tokens": 2055577.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 129.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03063986822962761, "kl": 0.00534681836143136, "learning_rate": 1.009e-06, "loss": 0.0003, "num_tokens": 2055835.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 129.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.06779507547616959, "kl": 0.0009523332118988037, "learning_rate": 1.0086666666666666e-06, "loss": 0.0, "num_tokens": 2056047.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 129.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.011507448740303516, "kl": 0.00032563507556915283, "learning_rate": 1.0083333333333333e-06, "loss": 0.0, "num_tokens": 2056257.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 129.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08644238114356995, "kl": 0.034969511441886425, "learning_rate": 1.0080000000000001e-06, "loss": 0.0016, "num_tokens": 2056584.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 129.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05382801964879036, "kl": 0.009691030019894242, "learning_rate": 1.0076666666666667e-06, "loss": 0.0005, "num_tokens": 2056914.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 129.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.11369314044713974, "kl": 0.006775143556296825, "learning_rate": 1.0073333333333335e-06, "loss": 0.0003, "num_tokens": 2057148.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 129.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.688161849975586, "kl": 0.34048211574554443, "learning_rate": 1.007e-06, "loss": 0.0578, "num_tokens": 2057462.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 129.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.029017580673098564, "kl": 0.0009145679650828242, "learning_rate": 1.0066666666666666e-06, "loss": 0.0, "num_tokens": 2057746.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 129.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.8432042598724365, "kl": 0.050350496312603354, "learning_rate": 1.0063333333333334e-06, "loss": -0.0009, "num_tokens": 2058034.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 129.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01573021523654461, "kl": 0.09661111608147621, "learning_rate": 1.006e-06, "loss": 0.0048, "num_tokens": 2058406.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 129.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.00953968707472086, "kl": 0.0014483824488706887, "learning_rate": 1.0056666666666666e-06, "loss": 0.0001, "num_tokens": 2058688.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 129.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03229231387376785, "kl": 0.00474539038259536, "learning_rate": 1.0053333333333333e-06, "loss": 0.0002, "num_tokens": 2058978.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 129.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 3.6413832276593894e-05, "kl": 2.1979212760925293e-06, "learning_rate": 1.0050000000000001e-06, "loss": 0.0, "num_tokens": 2059198.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 129.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.13440021872520447, "kl": 0.03015802800655365, "learning_rate": 1.0046666666666667e-06, "loss": 0.0015, "num_tokens": 2059492.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 129.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033988712821155787, "kl": 0.00016928050172282383, "learning_rate": 1.0043333333333335e-06, "loss": 0.0, "num_tokens": 2059804.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 129.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.015135381370782852, "kl": 0.013556838035583496, "learning_rate": 1.004e-06, "loss": 0.0007, "num_tokens": 2060064.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 129.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.12105981260538101, "kl": 0.005264115141471848, "learning_rate": 1.0036666666666666e-06, "loss": 0.0003, "num_tokens": 2060286.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 129.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.4594961404800415, "kl": 0.03914309912943281, "learning_rate": 1.0033333333333334e-06, "loss": 0.0021, "num_tokens": 2060612.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 129.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.11597444117069244, "kl": 0.032896749675273895, "learning_rate": 1.003e-06, "loss": 0.0016, "num_tokens": 2060912.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 129.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.08201200515031815, "kl": 0.018918459303677082, "learning_rate": 1.0026666666666665e-06, "loss": 0.0009, "num_tokens": 2061230.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 129.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.148848056793213, "kl": 0.40280213207006454, "learning_rate": 1.0023333333333335e-06, "loss": -0.0136, "num_tokens": 2061595.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 129.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.486007422208786, "kl": 0.07580779306590557, "learning_rate": 1.002e-06, "loss": 0.0035, "num_tokens": 2061915.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 129.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.08523035049438477, "kl": 0.0449199303984642, "learning_rate": 1.0016666666666667e-06, "loss": 0.0023, "num_tokens": 2062186.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 129.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.006495171692222357, "kl": 0.0015818774700164795, "learning_rate": 1.0013333333333335e-06, "loss": 0.0001, "num_tokens": 2062402.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 129.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04938627406954765, "kl": 0.0037298735696822405, "learning_rate": 1.001e-06, "loss": 0.0002, "num_tokens": 2062645.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 129.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.022117979824543, "kl": 0.0022824269253760576, "learning_rate": 1.0006666666666666e-06, "loss": 0.0001, "num_tokens": 2062927.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 129.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.06404024362564087, "kl": 0.004563490976579487, "learning_rate": 1.0003333333333334e-06, "loss": 0.0002, "num_tokens": 2063181.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 129.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.021524703130126, "kl": 0.005385205149650574, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 2063449.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 129.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.011293207295238972, "kl": 0.0011240161256864667, "learning_rate": 9.996666666666665e-07, "loss": 0.0001, "num_tokens": 2063751.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 129.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00586632639169693, "kl": 0.00010887086318689398, "learning_rate": 9.993333333333335e-07, "loss": 0.0, "num_tokens": 2064007.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 129.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.9347095489501953, "kl": 0.49228236079216003, "learning_rate": 9.99e-07, "loss": 0.0046, "num_tokens": 2064361.0, "reward": 5.5, "reward_std": 2.6140644550323486, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 2.6140644550323486, "step": 7004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 129.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008292172569781542, "kl": 0.003763720393180847, "learning_rate": 9.986666666666667e-07, "loss": 0.0002, "num_tokens": 2064597.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 129.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.07730316370725632, "kl": 0.015745405107736588, "learning_rate": 9.983333333333334e-07, "loss": 0.0008, "num_tokens": 2064902.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 129.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 4.363114833831787, "kl": 0.027033142123400467, "learning_rate": 9.98e-07, "loss": -0.0831, "num_tokens": 2065174.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 129.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.007219251710921526, "kl": 0.0020350394770503044, "learning_rate": 9.976666666666666e-07, "loss": 0.0001, "num_tokens": 2065486.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 129.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.12257125228643417, "kl": 0.009929151739925146, "learning_rate": 9.973333333333334e-07, "loss": 0.0005, "num_tokens": 2065827.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 129.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.6105772256851196, "kl": 0.09189913421869278, "learning_rate": 9.97e-07, "loss": -0.065, "num_tokens": 2066192.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 129.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.009716881439089775, "kl": 0.2671699821949005, "learning_rate": 9.966666666666665e-07, "loss": 0.0134, "num_tokens": 2066496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 129.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.020831193774938583, "kl": 0.0019416631548665464, "learning_rate": 9.963333333333335e-07, "loss": 0.0001, "num_tokens": 2066768.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 129.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.10478778928518295, "kl": 0.015824542846530676, "learning_rate": 9.96e-07, "loss": 0.0008, "num_tokens": 2067101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 129.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.1142385005950928, "kl": 0.16468075662851334, "learning_rate": 9.956666666666666e-07, "loss": -0.0264, "num_tokens": 2067501.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 7014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 129.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.02332492545247078, "kl": 0.0010798722505569458, "learning_rate": 9.953333333333334e-07, "loss": 0.0001, "num_tokens": 2067713.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 129.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.012855621054768562, "kl": 0.0004808790981769562, "learning_rate": 9.95e-07, "loss": 0.0, "num_tokens": 2067973.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 129.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.07349342852830887, "kl": 0.017411372624337673, "learning_rate": 9.946666666666666e-07, "loss": 0.001, "num_tokens": 2068255.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 129.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.10274455696344376, "kl": 0.024848349392414093, "learning_rate": 9.943333333333334e-07, "loss": 0.0012, "num_tokens": 2068557.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 129.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.3386154174804688, "kl": 0.09217208810150623, "learning_rate": 9.94e-07, "loss": 0.1317, "num_tokens": 2068907.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 130.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.5648937225341797, "kl": 0.07233530622033868, "learning_rate": 9.936666666666667e-07, "loss": 0.1104, "num_tokens": 2069209.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 130.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.461804986000061, "kl": 0.12070186994969845, "learning_rate": 9.933333333333335e-07, "loss": 0.0869, "num_tokens": 2069554.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 7021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 130.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.027200795710086823, "kl": 0.006897842977195978, "learning_rate": 9.93e-07, "loss": 0.0003, "num_tokens": 2069886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 130.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 5.096599102020264, "kl": 0.05096434731967747, "learning_rate": 9.926666666666666e-07, "loss": 0.0858, "num_tokens": 2070186.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 130.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.009517742320895195, "kl": 0.26722897589206696, "learning_rate": 9.923333333333334e-07, "loss": 0.0134, "num_tokens": 2070490.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 130.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02489904686808586, "kl": 0.0003945454955101013, "learning_rate": 9.92e-07, "loss": 0.0, "num_tokens": 2070700.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 130.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02939853072166443, "kl": 0.006774634122848511, "learning_rate": 9.916666666666666e-07, "loss": 0.0003, "num_tokens": 2070972.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 130.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 0.4942113757133484, "kl": 0.2910715565085411, "learning_rate": 9.913333333333333e-07, "loss": -0.0058, "num_tokens": 2071340.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 7027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 59.75, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 130.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.380537986755371, "kl": 0.10103682242333889, "learning_rate": 9.91e-07, "loss": 0.2665, "num_tokens": 2071795.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 130.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.018369261175394058, "kl": 0.0023810090497136116, "learning_rate": 9.906666666666667e-07, "loss": 0.0001, "num_tokens": 2072107.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 130.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02092871069908142, "kl": 0.0007555286138085648, "learning_rate": 9.903333333333335e-07, "loss": 0.0, "num_tokens": 2072323.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 130.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03298458084464073, "kl": 0.006786221172660589, "learning_rate": 9.9e-07, "loss": 0.0003, "num_tokens": 2072646.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 130.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07828333228826523, "kl": 0.007808617083355784, "learning_rate": 9.896666666666666e-07, "loss": 0.0004, "num_tokens": 2072908.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 130.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.10994374006986618, "kl": 0.016406590584665537, "learning_rate": 9.893333333333334e-07, "loss": 0.0008, "num_tokens": 2073233.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 130.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 5.831333160400391, "kl": 0.04937233589589596, "learning_rate": 9.89e-07, "loss": 0.0972, "num_tokens": 2073559.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 130.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09697316586971283, "kl": 0.024668416008353233, "learning_rate": 9.886666666666668e-07, "loss": 0.0012, "num_tokens": 2073886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 130.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07783915847539902, "kl": 0.003793957643210888, "learning_rate": 9.883333333333333e-07, "loss": 0.0002, "num_tokens": 2074120.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 130.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.231584548950195, "kl": 0.025572039652615786, "learning_rate": 9.88e-07, "loss": 0.2435, "num_tokens": 2074445.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 130.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 3.1727960109710693, "kl": 0.5397544535808265, "learning_rate": 9.876666666666667e-07, "loss": 0.0284, "num_tokens": 2074706.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 130.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005854532122612, "kl": 0.04777650721371174, "learning_rate": 9.873333333333335e-07, "loss": 0.0024, "num_tokens": 2075110.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 130.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.014469306915998459, "kl": 0.0014208531356416643, "learning_rate": 9.87e-07, "loss": 0.0001, "num_tokens": 2075410.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 130.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03294166550040245, "kl": 0.0030110597144812346, "learning_rate": 9.866666666666668e-07, "loss": 0.0002, "num_tokens": 2075702.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 130.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.4369101524353027, "kl": 0.12501885369420052, "learning_rate": 9.863333333333334e-07, "loss": -0.1031, "num_tokens": 2076069.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 130.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.027926573529839516, "kl": 0.0006210476385604125, "learning_rate": 9.86e-07, "loss": 0.0, "num_tokens": 2076325.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 130.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.05098537355661392, "kl": 0.005229807575233281, "learning_rate": 9.856666666666667e-07, "loss": 0.0003, "num_tokens": 2076615.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 130.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.024386560544371605, "kl": 0.0015731000748928636, "learning_rate": 9.853333333333333e-07, "loss": 0.0001, "num_tokens": 2076888.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 130.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.021922960877418518, "kl": 0.006068588700145483, "learning_rate": 9.849999999999999e-07, "loss": 0.0003, "num_tokens": 2077156.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 130.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.11238568276166916, "kl": 0.03599953092634678, "learning_rate": 9.846666666666667e-07, "loss": 0.0018, "num_tokens": 2077454.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 130.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06677719205617905, "kl": 0.005988605320453644, "learning_rate": 9.843333333333335e-07, "loss": 0.0003, "num_tokens": 2077726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 130.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.024617446586489677, "kl": 0.002243560622446239, "learning_rate": 9.84e-07, "loss": 0.0001, "num_tokens": 2078052.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 130.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.010872730985283852, "kl": 0.00036402890691533685, "learning_rate": 9.836666666666668e-07, "loss": 0.0, "num_tokens": 2078366.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 130.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11603320389986038, "kl": 0.009916636859998107, "learning_rate": 9.833333333333334e-07, "loss": 0.0005, "num_tokens": 2078629.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 130.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03702886402606964, "kl": 0.011082913508289494, "learning_rate": 9.83e-07, "loss": 0.0006, "num_tokens": 2078916.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 130.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 9.439342829864472e-05, "kl": 3.2335519790649414e-06, "learning_rate": 9.826666666666667e-07, "loss": 0.0, "num_tokens": 2079136.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 130.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.000660731631796807, "kl": 0.0012536580907180905, "learning_rate": 9.823333333333333e-07, "loss": 0.0001, "num_tokens": 2079416.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 130.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1541651487350464, "kl": 0.005077527370303869, "learning_rate": 9.819999999999999e-07, "loss": 0.0002, "num_tokens": 2079629.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 130.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.046823784708976746, "kl": 0.009026986081153154, "learning_rate": 9.816666666666669e-07, "loss": 0.0005, "num_tokens": 2079915.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 130.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01296068076044321, "kl": 0.16126051545143127, "learning_rate": 9.813333333333334e-07, "loss": 0.0081, "num_tokens": 2080224.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 130.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984318375587463, "kl": 0.009020114550366998, "learning_rate": 9.81e-07, "loss": 0.0005, "num_tokens": 2080484.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 130.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.009963327087461948, "kl": 0.008633102290332317, "learning_rate": 9.806666666666668e-07, "loss": 0.0004, "num_tokens": 2080756.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 130.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.17390818893909454, "kl": 0.019016915932297707, "learning_rate": 9.803333333333334e-07, "loss": 0.001, "num_tokens": 2081032.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 130.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.050935182720422745, "kl": 0.030692865140736103, "learning_rate": 9.8e-07, "loss": 0.0015, "num_tokens": 2081389.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 130.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.701384544372559, "kl": 0.06063641281798482, "learning_rate": 9.796666666666667e-07, "loss": -0.1025, "num_tokens": 2081666.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 130.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.056545305997133255, "kl": 0.03181551210582256, "learning_rate": 9.793333333333333e-07, "loss": 0.0016, "num_tokens": 2082012.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 130.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008005702402442694, "kl": 0.0037714391946792603, "learning_rate": 9.789999999999999e-07, "loss": 0.0002, "num_tokens": 2082248.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 130.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.014514670707285404, "kl": 0.0010657445527613163, "learning_rate": 9.786666666666669e-07, "loss": 0.0001, "num_tokens": 2082526.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 130.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005094868130981922, "kl": 0.00036312639713287354, "learning_rate": 9.783333333333334e-07, "loss": 0.0, "num_tokens": 2082770.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 130.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.005842653568834066, "kl": 0.0036643360799644142, "learning_rate": 9.78e-07, "loss": 0.0002, "num_tokens": 2083028.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 130.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.020485663786530495, "kl": 0.0008285753428936005, "learning_rate": 9.776666666666668e-07, "loss": 0.0, "num_tokens": 2083288.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 130.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.02006685361266136, "kl": 0.0021281553199514747, "learning_rate": 9.773333333333333e-07, "loss": 0.0001, "num_tokens": 2083570.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 130.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.66013765335083, "kl": 0.04869246482849121, "learning_rate": 9.77e-07, "loss": -0.232, "num_tokens": 2083902.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 7070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 130.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.01702677085995674, "kl": 0.0008085608133114874, "learning_rate": 9.766666666666667e-07, "loss": 0.0, "num_tokens": 2084121.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 130.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12956717610359192, "kl": 0.010188494343310595, "learning_rate": 9.763333333333333e-07, "loss": 0.0006, "num_tokens": 2084344.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 130.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.020360322669148445, "kl": 0.0009623808145988733, "learning_rate": 9.759999999999998e-07, "loss": 0.0, "num_tokens": 2084673.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 131.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.07670935243368149, "kl": 0.01936477469280362, "learning_rate": 9.756666666666668e-07, "loss": 0.001, "num_tokens": 2084994.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 131.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.058146726340055466, "kl": 0.01534320879727602, "learning_rate": 9.753333333333334e-07, "loss": 0.0008, "num_tokens": 2085296.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 131.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.024496566504240036, "kl": 0.001151248812675476, "learning_rate": 9.75e-07, "loss": 0.0001, "num_tokens": 2085508.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 131.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.008731354959309101, "kl": 0.0004917159676551819, "learning_rate": 9.746666666666668e-07, "loss": 0.0, "num_tokens": 2085768.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 131.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03717421367764473, "kl": 0.0006443336606025696, "learning_rate": 9.743333333333333e-07, "loss": 0.0, "num_tokens": 2085974.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 131.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.2457912415266037, "kl": 0.12425431609153748, "learning_rate": 9.74e-07, "loss": 0.0062, "num_tokens": 2086346.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 131.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03047761879861355, "kl": 0.004860547371208668, "learning_rate": 9.736666666666667e-07, "loss": 0.0002, "num_tokens": 2086630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 131.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04770100116729736, "kl": 0.006718369899317622, "learning_rate": 9.733333333333333e-07, "loss": 0.0003, "num_tokens": 2086930.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 131.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.021395940333604813, "kl": 0.0018522377649787813, "learning_rate": 9.73e-07, "loss": 0.0001, "num_tokens": 2087218.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 131.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.05242915824055672, "kl": 0.013319097459316254, "learning_rate": 9.726666666666668e-07, "loss": 0.0007, "num_tokens": 2087488.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 131.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10964227467775345, "kl": 0.008151817324687727, "learning_rate": 9.723333333333334e-07, "loss": 0.0004, "num_tokens": 2087707.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 131.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.014443033374845982, "kl": 0.0035891212755814195, "learning_rate": 9.72e-07, "loss": 0.0002, "num_tokens": 2087975.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 131.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.043561384081840515, "kl": 0.008046003174968064, "learning_rate": 9.716666666666668e-07, "loss": 0.0004, "num_tokens": 2088302.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 131.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.05766814574599266, "kl": 0.05627436190843582, "learning_rate": 9.713333333333333e-07, "loss": 0.0028, "num_tokens": 2088639.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 131.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 6.47513588774018e-05, "kl": 2.600252628326416e-06, "learning_rate": 9.709999999999999e-07, "loss": 0.0, "num_tokens": 2088859.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 131.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.464715480804443, "kl": 0.06628647446632385, "learning_rate": 9.706666666666667e-07, "loss": 0.0071, "num_tokens": 2089135.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 7089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 131.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07437316328287125, "kl": 0.021190166473388672, "learning_rate": 9.703333333333332e-07, "loss": 0.0012, "num_tokens": 2089415.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 131.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004481513053178787, "kl": 0.0003196708858013153, "learning_rate": 9.7e-07, "loss": 0.0, "num_tokens": 2089659.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 131.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02414514496922493, "kl": 0.001568454084917903, "learning_rate": 9.696666666666668e-07, "loss": 0.0001, "num_tokens": 2089966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 131.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.058970700949430466, "kl": 0.009768346790224314, "learning_rate": 9.693333333333334e-07, "loss": 0.0006, "num_tokens": 2090349.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 131.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.07728622853755951, "kl": 0.0364050418138504, "learning_rate": 9.69e-07, "loss": 0.0019, "num_tokens": 2090720.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 131.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.08415987342596054, "kl": 0.006773005472496152, "learning_rate": 9.686666666666667e-07, "loss": 0.0003, "num_tokens": 2090988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 131.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 1.8247379064559937, "kl": 0.09288447350263596, "learning_rate": 9.683333333333333e-07, "loss": -0.0913, "num_tokens": 2091337.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 7096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 131.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.017693351954221725, "kl": 0.0019776462577283382, "learning_rate": 9.68e-07, "loss": 0.0001, "num_tokens": 2091619.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 131.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.263444185256958, "kl": 0.05585545673966408, "learning_rate": 9.676666666666667e-07, "loss": 0.0028, "num_tokens": 2091949.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 131.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.276368111371994, "kl": 0.022382110357284546, "learning_rate": 9.673333333333332e-07, "loss": 0.0011, "num_tokens": 2092165.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 131.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 9.272090911865234, "kl": 0.016909361351281404, "learning_rate": 9.67e-07, "loss": 0.1955, "num_tokens": 2092437.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 7100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 131.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.991854190826416, "kl": 0.018948630429804325, "learning_rate": 9.666666666666668e-07, "loss": 0.2911, "num_tokens": 2092722.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 7101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 131.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.29906225204467773, "kl": 0.04519226215779781, "learning_rate": 9.663333333333334e-07, "loss": 0.0024, "num_tokens": 2092998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 131.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.038236718624830246, "kl": 0.0036044390872120857, "learning_rate": 9.660000000000002e-07, "loss": 0.0002, "num_tokens": 2093258.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 131.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.05997953936457634, "kl": 0.047408703714609146, "learning_rate": 9.656666666666667e-07, "loss": 0.0024, "num_tokens": 2093662.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 131.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05586628243327141, "kl": 0.016930708661675453, "learning_rate": 9.653333333333333e-07, "loss": 0.0008, "num_tokens": 2093978.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 131.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.030245967209339142, "kl": 0.15784524381160736, "learning_rate": 9.65e-07, "loss": 0.0079, "num_tokens": 2094289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 131.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.18933509290218353, "kl": 0.018535910174250603, "learning_rate": 9.646666666666666e-07, "loss": 0.001, "num_tokens": 2094563.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 131.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.1340102106332779, "kl": 0.016697907354682684, "learning_rate": 9.643333333333332e-07, "loss": 0.0008, "num_tokens": 2094859.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 131.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04514975845813751, "kl": 0.004168018000200391, "learning_rate": 9.64e-07, "loss": 0.0002, "num_tokens": 2095190.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 131.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.527789831161499, "kl": 0.9675229638814926, "learning_rate": 9.636666666666668e-07, "loss": 0.0478, "num_tokens": 2095497.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 7110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 131.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07461269199848175, "kl": 0.002892457414418459, "learning_rate": 9.633333333333334e-07, "loss": 0.0001, "num_tokens": 2095730.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 131.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.027584195137023926, "kl": 0.00124273075198289, "learning_rate": 9.630000000000001e-07, "loss": 0.0001, "num_tokens": 2096041.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 131.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0327063724398613, "kl": 0.008052623365074396, "learning_rate": 9.626666666666667e-07, "loss": 0.0004, "num_tokens": 2096329.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 131.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.005709430668503046, "kl": 0.0009871545480564237, "learning_rate": 9.623333333333333e-07, "loss": 0.0, "num_tokens": 2096625.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 131.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.013671758584678173, "kl": 0.0005690186808351427, "learning_rate": 9.62e-07, "loss": 0.0, "num_tokens": 2096948.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 131.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.027838649228215218, "kl": 0.0007231563213281333, "learning_rate": 9.616666666666666e-07, "loss": 0.0, "num_tokens": 2097204.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 131.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10903138667345047, "kl": 0.020756863988935947, "learning_rate": 9.613333333333332e-07, "loss": 0.001, "num_tokens": 2097543.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 131.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.282342910766602, "kl": 0.2681761170970276, "learning_rate": 9.610000000000002e-07, "loss": 0.0486, "num_tokens": 2097819.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 131.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.8388426303863525, "kl": 0.8125451585510746, "learning_rate": 9.606666666666668e-07, "loss": 0.0439, "num_tokens": 2098107.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 131.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08289333432912827, "kl": 0.03515140898525715, "learning_rate": 9.603333333333333e-07, "loss": 0.0018, "num_tokens": 2098462.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 131.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.005263836123049259, "kl": 0.00024227624089689925, "learning_rate": 9.600000000000001e-07, "loss": 0.0, "num_tokens": 2098724.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 131.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05346520617604256, "kl": 0.012155882082879543, "learning_rate": 9.596666666666667e-07, "loss": 0.0006, "num_tokens": 2099045.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 131.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.05276263877749443, "kl": 0.02926110289990902, "learning_rate": 9.593333333333333e-07, "loss": 0.0015, "num_tokens": 2099345.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 131.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007885195664130151, "kl": 0.0037667304277420044, "learning_rate": 9.59e-07, "loss": 0.0002, "num_tokens": 2099581.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 131.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.11647625267505646, "kl": 0.002949245972558856, "learning_rate": 9.586666666666666e-07, "loss": 0.0001, "num_tokens": 2099853.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 131.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.033407438546419144, "kl": 0.00026413053274154663, "learning_rate": 9.583333333333332e-07, "loss": 0.0, "num_tokens": 2100065.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 131.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.6440527439117432, "kl": 0.12046198267489672, "learning_rate": 9.580000000000002e-07, "loss": 0.006, "num_tokens": 2100377.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 7127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 132.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.056929267942905426, "kl": 0.05902410298585892, "learning_rate": 9.576666666666668e-07, "loss": 0.0029, "num_tokens": 2100758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 132.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.22061622142791748, "kl": 0.030477201100438833, "learning_rate": 9.573333333333333e-07, "loss": 0.0018, "num_tokens": 2101030.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 132.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.010728205554187298, "kl": 0.007938293274492025, "learning_rate": 9.570000000000001e-07, "loss": 0.0004, "num_tokens": 2101302.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 132.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.01718173548579216, "kl": 0.0008112609793897718, "learning_rate": 9.566666666666667e-07, "loss": 0.0, "num_tokens": 2101592.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 132.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.4237317740917206, "kl": 0.06007047765888274, "learning_rate": 9.563333333333333e-07, "loss": 0.0032, "num_tokens": 2101898.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 132.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.018750177696347237, "kl": 0.000255244696745649, "learning_rate": 9.56e-07, "loss": 0.0, "num_tokens": 2102155.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 132.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.13300053775310516, "kl": 0.01826266571879387, "learning_rate": 9.556666666666666e-07, "loss": 0.0009, "num_tokens": 2102417.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.006708446424454451, "kl": 0.0015731006860733032, "learning_rate": 9.553333333333332e-07, "loss": 0.0001, "num_tokens": 2102633.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 132.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03312021493911743, "kl": 0.0016663968563079834, "learning_rate": 9.550000000000002e-07, "loss": 0.0001, "num_tokens": 2102876.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 132.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.852743625640869, "kl": 0.059035494923591614, "learning_rate": 9.546666666666667e-07, "loss": 0.0064, "num_tokens": 2103250.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 132.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.004768291022628546, "kl": 0.0008493363857269287, "learning_rate": 9.543333333333333e-07, "loss": 0.0, "num_tokens": 2103466.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 132.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0716002807021141, "kl": 0.012565402314066887, "learning_rate": 9.54e-07, "loss": 0.0006, "num_tokens": 2103794.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 132.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.025367960333824158, "kl": 0.05874118395149708, "learning_rate": 9.536666666666667e-07, "loss": 0.0029, "num_tokens": 2104126.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 132.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06814675033092499, "kl": 0.01120592188090086, "learning_rate": 9.533333333333333e-07, "loss": 0.0006, "num_tokens": 2104475.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 132.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05073690041899681, "kl": 0.00727212755009532, "learning_rate": 9.53e-07, "loss": 0.0003, "num_tokens": 2104775.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.1490279734134674, "kl": 0.004201886593364179, "learning_rate": 9.526666666666666e-07, "loss": 0.0002, "num_tokens": 2104989.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 132.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.026240535080432892, "kl": 0.15914025157690048, "learning_rate": 9.523333333333333e-07, "loss": 0.008, "num_tokens": 2105299.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 132.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 1.0895161628723145, "kl": 0.22882658801972866, "learning_rate": 9.520000000000001e-07, "loss": 0.0113, "num_tokens": 2105626.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 132.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.016096575185656548, "kl": 0.003814885189058259, "learning_rate": 9.516666666666667e-07, "loss": 0.0002, "num_tokens": 2105886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 132.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.006233544554561377, "kl": 0.0003956861619371921, "learning_rate": 9.513333333333334e-07, "loss": 0.0, "num_tokens": 2106207.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 132.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.046236991882324, "kl": 0.268627118319273, "learning_rate": 9.510000000000001e-07, "loss": 0.0064, "num_tokens": 2106578.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 7148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 132.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016420314786955714, "kl": 0.0003929436206817627, "learning_rate": 9.506666666666667e-07, "loss": 0.0, "num_tokens": 2106838.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.09429097920656204, "kl": 0.004155139613430947, "learning_rate": 9.503333333333333e-07, "loss": 0.0002, "num_tokens": 2107056.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 132.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.9344552755355835, "kl": 0.03557317424565554, "learning_rate": 9.5e-07, "loss": 0.0397, "num_tokens": 2107375.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 132.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.041324201971292496, "kl": 0.2614790052175522, "learning_rate": 9.496666666666666e-07, "loss": 0.0131, "num_tokens": 2107679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0645325630903244, "kl": 0.000960037112236023, "learning_rate": 9.493333333333333e-07, "loss": 0.0, "num_tokens": 2107891.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 132.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03375959396362305, "kl": 0.0027521795127540827, "learning_rate": 9.49e-07, "loss": 0.0001, "num_tokens": 2108218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 132.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.6301610469818115, "kl": 0.022922604344785213, "learning_rate": 9.486666666666667e-07, "loss": 0.2837, "num_tokens": 2108578.0, "reward": 6.25, "reward_std": 2.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 2.5, "step": 7155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 132.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1570698767900467, "kl": 0.03986375965178013, "learning_rate": 9.483333333333334e-07, "loss": 0.002, "num_tokens": 2108853.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 132.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.3675270080566406, "kl": 0.07046718522906303, "learning_rate": 9.480000000000001e-07, "loss": 0.0045, "num_tokens": 2109250.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 7157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 132.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.015076756477355957, "kl": 0.0035436644102446735, "learning_rate": 9.476666666666666e-07, "loss": 0.0002, "num_tokens": 2109518.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.048188216984272, "kl": 0.004340869025327265, "learning_rate": 9.473333333333333e-07, "loss": 0.0002, "num_tokens": 2109795.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 132.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.018355129286646843, "kl": 0.0006780156109016389, "learning_rate": 9.47e-07, "loss": 0.0, "num_tokens": 2110031.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 132.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0468345507979393, "kl": 0.007834693882614374, "learning_rate": 9.466666666666667e-07, "loss": 0.0004, "num_tokens": 2110315.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 132.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.8543777465820312, "kl": 0.04336899612098932, "learning_rate": 9.463333333333335e-07, "loss": -0.0715, "num_tokens": 2110607.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 7162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 132.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016212743939831853, "kl": 0.0002623516629682854, "learning_rate": 9.460000000000001e-07, "loss": 0.0, "num_tokens": 2110921.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03348354622721672, "kl": 0.0027104535838589072, "learning_rate": 9.456666666666667e-07, "loss": 0.0001, "num_tokens": 2111219.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008736312738619745, "kl": 0.003747418522834778, "learning_rate": 9.453333333333334e-07, "loss": 0.0002, "num_tokens": 2111455.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.131101608276367, "kl": 0.05928567633964121, "learning_rate": 9.450000000000001e-07, "loss": 0.0233, "num_tokens": 2111728.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 132.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.020665723830461502, "kl": 0.006254879139305558, "learning_rate": 9.446666666666666e-07, "loss": 0.0003, "num_tokens": 2112000.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 132.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.08057216554880142, "kl": 0.006814703578129411, "learning_rate": 9.443333333333333e-07, "loss": 0.0003, "num_tokens": 2112263.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7168 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.01315789483487606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01315789483487606, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 132.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.7153453826904297, "kl": 0.06348188780248165, "learning_rate": 9.44e-07, "loss": 0.0156, "num_tokens": 2112564.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 132.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.00875805038958788, "kl": 0.0006303263508016244, "learning_rate": 9.436666666666667e-07, "loss": 0.0, "num_tokens": 2112840.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.044893015176057816, "kl": 0.006651686737313867, "learning_rate": 9.433333333333334e-07, "loss": 0.0003, "num_tokens": 2113110.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.059397123754024506, "kl": 0.012119903694838285, "learning_rate": 9.430000000000001e-07, "loss": 0.0006, "num_tokens": 2113390.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003593391156755388, "kl": 0.00020550936460494995, "learning_rate": 9.426666666666667e-07, "loss": 0.0, "num_tokens": 2113610.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 132.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.021664176136255264, "kl": 0.0019108533742837608, "learning_rate": 9.423333333333334e-07, "loss": 0.0001, "num_tokens": 2113870.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 132.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.13996350765228271, "kl": 0.02745542861521244, "learning_rate": 9.42e-07, "loss": 0.0015, "num_tokens": 2114210.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7175 }, { "clip_ratio/high_max": 0.012195121496915817, "clip_ratio/high_mean": 0.012195121496915817, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012195121496915817, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 132.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 17.1121768951416, "kl": 1.65606177598238, "learning_rate": 9.416666666666667e-07, "loss": 0.0882, "num_tokens": 2114518.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 7176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03864877671003342, "kl": 0.0028798532439395785, "learning_rate": 9.413333333333333e-07, "loss": 0.0001, "num_tokens": 2114798.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 132.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06892888247966766, "kl": 0.012835131026804447, "learning_rate": 9.41e-07, "loss": 0.0006, "num_tokens": 2115112.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 132.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 3.4013471603393555, "kl": 0.09095125645399094, "learning_rate": 9.406666666666666e-07, "loss": -0.0371, "num_tokens": 2115474.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 132.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.048363130539655685, "kl": 0.010076596401631832, "learning_rate": 9.403333333333334e-07, "loss": 0.0005, "num_tokens": 2115765.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 132.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 0.6886708736419678, "kl": 0.07726818695664406, "learning_rate": 9.400000000000001e-07, "loss": 0.0109, "num_tokens": 2116171.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 7181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 133.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08039382845163345, "kl": 0.018315540626645088, "learning_rate": 9.396666666666667e-07, "loss": 0.0009, "num_tokens": 2116518.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 133.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03761099651455879, "kl": 0.005296449642628431, "learning_rate": 9.393333333333334e-07, "loss": 0.0003, "num_tokens": 2116786.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 133.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01886494643986225, "kl": 0.0023565638111904263, "learning_rate": 9.39e-07, "loss": 0.0001, "num_tokens": 2117068.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 133.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.13610948622226715, "kl": 0.01815592311322689, "learning_rate": 9.386666666666667e-07, "loss": 0.0009, "num_tokens": 2117344.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 133.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.31861409544944763, "kl": 0.03766192775219679, "learning_rate": 9.383333333333333e-07, "loss": 0.0019, "num_tokens": 2117615.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 133.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.002420982113108039, "kl": 2.590566873550415e-05, "learning_rate": 9.38e-07, "loss": 0.0, "num_tokens": 2117827.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 133.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06352825462818146, "kl": 0.001503325649537146, "learning_rate": 9.376666666666666e-07, "loss": 0.0001, "num_tokens": 2118061.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 133.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03194012865424156, "kl": 0.011991779319941998, "learning_rate": 9.373333333333334e-07, "loss": 0.0006, "num_tokens": 2118333.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 133.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.7620279788970947, "kl": 0.09522133693099022, "learning_rate": 9.370000000000001e-07, "loss": -0.0202, "num_tokens": 2118699.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 133.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.08648505061864853, "kl": 0.006787155929487199, "learning_rate": 9.366666666666668e-07, "loss": 0.0003, "num_tokens": 2119025.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 133.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01592927612364292, "kl": 0.00040593318408355117, "learning_rate": 9.363333333333333e-07, "loss": 0.0, "num_tokens": 2119305.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 133.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.010431910865008831, "kl": 0.0008702820050530136, "learning_rate": 9.36e-07, "loss": 0.0, "num_tokens": 2119573.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 133.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.2150237262248993, "kl": 0.05814521200954914, "learning_rate": 9.356666666666667e-07, "loss": 0.003, "num_tokens": 2119863.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 133.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 5.396121501922607, "kl": 0.033672990277409554, "learning_rate": 9.353333333333334e-07, "loss": 0.1315, "num_tokens": 2120186.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 133.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.009516783989965916, "kl": 0.00034907087683677673, "learning_rate": 9.349999999999999e-07, "loss": 0.0, "num_tokens": 2120430.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 133.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03322471305727959, "kl": 0.0030200803303159773, "learning_rate": 9.346666666666666e-07, "loss": 0.0002, "num_tokens": 2120728.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 133.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.036663591861724854, "kl": 0.00414731225464493, "learning_rate": 9.343333333333334e-07, "loss": 0.0002, "num_tokens": 2121030.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 133.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08941317349672318, "kl": 0.011213188990950584, "learning_rate": 9.340000000000001e-07, "loss": 0.0006, "num_tokens": 2121308.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 93.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 93.75, "completions/mean_terminated_length": 39.66666793823242, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 133.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.2320513725280762, "kl": 0.06742130592465401, "learning_rate": 9.336666666666668e-07, "loss": 0.0537, "num_tokens": 2121899.0, "reward": 4.175000190734863, "reward_std": 4.4485015869140625, "rewards/reward_combined/mean": 4.175000190734863, "rewards/reward_combined/std": 4.4485015869140625, "step": 7200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 133.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.058102354407310486, "kl": 0.010958315804600716, "learning_rate": 9.333333333333333e-07, "loss": 0.0005, "num_tokens": 2122211.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 133.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.034183163195848465, "kl": 0.1602887436747551, "learning_rate": 9.33e-07, "loss": 0.008, "num_tokens": 2122521.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 133.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.08965833485126495, "kl": 0.012937887106090784, "learning_rate": 9.326666666666667e-07, "loss": 0.0007, "num_tokens": 2122794.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 133.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007982464740052819, "kl": 0.00376911461353302, "learning_rate": 9.323333333333334e-07, "loss": 0.0002, "num_tokens": 2123030.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 133.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.019955292344093323, "kl": 0.000316062563797459, "learning_rate": 9.319999999999999e-07, "loss": 0.0, "num_tokens": 2123286.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 133.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.010446756146848202, "kl": 0.0006545372307300568, "learning_rate": 9.316666666666666e-07, "loss": 0.0, "num_tokens": 2123546.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 133.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06672053039073944, "kl": 0.02910972759127617, "learning_rate": 9.313333333333334e-07, "loss": 0.0015, "num_tokens": 2123912.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 133.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.22056028246879578, "kl": 0.06442374363541603, "learning_rate": 9.310000000000001e-07, "loss": 0.0034, "num_tokens": 2124246.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 133.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.048039067536592484, "kl": 0.008211891632527113, "learning_rate": 9.306666666666667e-07, "loss": 0.0004, "num_tokens": 2124570.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 133.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09931541234254837, "kl": 0.022510704584419727, "learning_rate": 9.303333333333334e-07, "loss": 0.0012, "num_tokens": 2124873.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 133.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06466849148273468, "kl": 0.008924984140321612, "learning_rate": 9.3e-07, "loss": 0.0004, "num_tokens": 2125193.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 133.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.004334203898906708, "kl": 0.0012924571637995541, "learning_rate": 9.296666666666667e-07, "loss": 0.0001, "num_tokens": 2125409.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 133.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.001046365941874683, "kl": 0.0013271399657242, "learning_rate": 9.293333333333333e-07, "loss": 0.0001, "num_tokens": 2125689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 133.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.009608503431081772, "kl": 0.0035395300947129726, "learning_rate": 9.289999999999999e-07, "loss": 0.0002, "num_tokens": 2125980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 133.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0517580583691597, "kl": 0.2592521905899048, "learning_rate": 9.286666666666666e-07, "loss": 0.013, "num_tokens": 2126284.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 133.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04558124765753746, "kl": 0.04163266532123089, "learning_rate": 9.283333333333334e-07, "loss": 0.0021, "num_tokens": 2126689.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 133.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026923806872218847, "kl": 0.00033708091359585524, "learning_rate": 9.28e-07, "loss": 0.0, "num_tokens": 2126951.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 133.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.019567392766475677, "kl": 0.04918581433594227, "learning_rate": 9.276666666666667e-07, "loss": 0.0025, "num_tokens": 2127283.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 133.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.037634242326021194, "kl": 0.004057594807818532, "learning_rate": 9.273333333333334e-07, "loss": 0.0002, "num_tokens": 2127581.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 133.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.990529179573059, "kl": 0.016978265717625618, "learning_rate": 9.27e-07, "loss": -0.0566, "num_tokens": 2127973.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 7220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 133.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.018729979172348976, "kl": 0.012191944755613804, "learning_rate": 9.266666666666667e-07, "loss": 0.0006, "num_tokens": 2128233.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 133.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.01827273704111576, "kl": 0.0008070696494542062, "learning_rate": 9.263333333333333e-07, "loss": 0.0, "num_tokens": 2128525.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 133.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.22310420870780945, "kl": 0.021159586030989885, "learning_rate": 9.26e-07, "loss": 0.0011, "num_tokens": 2128791.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 133.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.012367135845124722, "kl": 0.000512710539624095, "learning_rate": 9.256666666666668e-07, "loss": 0.0, "num_tokens": 2129112.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 133.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 3.0059858545428142e-05, "kl": 2.086162567138672e-06, "learning_rate": 9.253333333333335e-07, "loss": 0.0, "num_tokens": 2129332.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 133.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06855064630508423, "kl": 0.020937534049153328, "learning_rate": 9.25e-07, "loss": 0.001, "num_tokens": 2129664.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 133.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03979175537824631, "kl": 0.008876292733475566, "learning_rate": 9.246666666666667e-07, "loss": 0.0005, "num_tokens": 2130003.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 133.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08385109156370163, "kl": 0.035915348678827286, "learning_rate": 9.243333333333334e-07, "loss": 0.0018, "num_tokens": 2130305.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 133.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.11219722032546997, "kl": 0.01823048759251833, "learning_rate": 9.24e-07, "loss": 0.0009, "num_tokens": 2130601.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 133.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.003798265242949128, "kl": 0.0012842849391745403, "learning_rate": 9.236666666666666e-07, "loss": 0.0001, "num_tokens": 2130820.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 133.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 11.193485260009766, "kl": 0.005961761809885502, "learning_rate": 9.233333333333333e-07, "loss": 0.2998, "num_tokens": 2131047.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 133.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.17145322263240814, "kl": 0.01510188402608037, "learning_rate": 9.23e-07, "loss": 0.0008, "num_tokens": 2131309.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 133.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 1.0979493856430054, "kl": 0.21211452782154083, "learning_rate": 9.226666666666668e-07, "loss": -0.0084, "num_tokens": 2131679.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 7233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 133.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.017782876268029213, "kl": 0.0005536973476409912, "learning_rate": 9.223333333333335e-07, "loss": 0.0, "num_tokens": 2131889.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 133.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.012977372854948044, "kl": 0.0006392856448655948, "learning_rate": 9.22e-07, "loss": 0.0, "num_tokens": 2132198.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 134.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03445803374052048, "kl": 0.0052281885873526335, "learning_rate": 9.216666666666667e-07, "loss": 0.0003, "num_tokens": 2132456.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 134.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.016842296347022057, "kl": 0.002088193374220282, "learning_rate": 9.213333333333334e-07, "loss": 0.0001, "num_tokens": 2132726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 134.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.2457070350646973, "kl": 0.04580681957304478, "learning_rate": 9.210000000000001e-07, "loss": 0.0706, "num_tokens": 2133017.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 134.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02117704600095749, "kl": 0.005317485425621271, "learning_rate": 9.206666666666666e-07, "loss": 0.0003, "num_tokens": 2133347.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 134.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.02129325270652771, "kl": 0.00043713750346796587, "learning_rate": 9.203333333333333e-07, "loss": 0.0, "num_tokens": 2133615.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 134.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02226569689810276, "kl": 0.00281720410566777, "learning_rate": 9.2e-07, "loss": 0.0001, "num_tokens": 2133897.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 134.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.23504067957401276, "kl": 0.06554269045591354, "learning_rate": 9.196666666666668e-07, "loss": 0.0035, "num_tokens": 2134264.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 134.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.200326919555664, "kl": 0.16619390342384577, "learning_rate": 9.193333333333334e-07, "loss": -0.2013, "num_tokens": 2134627.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 134.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.010768643580377102, "kl": 0.0005014777125325054, "learning_rate": 9.19e-07, "loss": 0.0, "num_tokens": 2134947.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 134.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03616509214043617, "kl": 0.0031698253005743027, "learning_rate": 9.186666666666667e-07, "loss": 0.0001, "num_tokens": 2135224.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 134.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.039958786219358444, "kl": 0.005971988663077354, "learning_rate": 9.183333333333334e-07, "loss": 0.0003, "num_tokens": 2135524.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 134.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.004152415785938501, "kl": 0.00035480037331581116, "learning_rate": 9.18e-07, "loss": 0.0, "num_tokens": 2135768.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 134.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.049710825085639954, "kl": 0.008523158729076385, "learning_rate": 9.176666666666666e-07, "loss": 0.0004, "num_tokens": 2136101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 134.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.022367453202605247, "kl": 0.15715914964675903, "learning_rate": 9.173333333333333e-07, "loss": 0.0079, "num_tokens": 2136412.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 134.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05105190724134445, "kl": 0.25952573120594025, "learning_rate": 9.17e-07, "loss": 0.013, "num_tokens": 2136716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 134.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.11837867647409439, "kl": 0.022399356588721275, "learning_rate": 9.166666666666667e-07, "loss": 0.0011, "num_tokens": 2137006.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 134.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.053479500114917755, "kl": 0.03572300262749195, "learning_rate": 9.163333333333334e-07, "loss": 0.0018, "num_tokens": 2137343.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 134.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.022783905267715454, "kl": 0.0011843194661196321, "learning_rate": 9.160000000000001e-07, "loss": 0.0, "num_tokens": 2137559.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 134.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 3.229832172393799, "kl": 0.02698578219860792, "learning_rate": 9.156666666666667e-07, "loss": 0.3546, "num_tokens": 2137895.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 7254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 134.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.026810159906744957, "kl": 0.002930235117673874, "learning_rate": 9.153333333333334e-07, "loss": 0.0001, "num_tokens": 2138222.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 134.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.003228947054594755, "kl": 0.0002876996877603233, "learning_rate": 9.15e-07, "loss": 0.0, "num_tokens": 2138442.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 134.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.014878548681735992, "kl": 0.0009900970035232604, "learning_rate": 9.146666666666666e-07, "loss": 0.0, "num_tokens": 2138677.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 134.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 2.765254020690918, "kl": 0.1371312439441681, "learning_rate": 9.143333333333333e-07, "loss": 0.1271, "num_tokens": 2139016.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 134.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 5.0661940574646, "kl": 0.0509836096316576, "learning_rate": 9.14e-07, "loss": 0.0834, "num_tokens": 2139317.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 134.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.012088281102478504, "kl": 0.09811355918645859, "learning_rate": 9.136666666666667e-07, "loss": 0.0049, "num_tokens": 2139689.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 134.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.98696231842041, "kl": 0.07401011791080236, "learning_rate": 9.133333333333334e-07, "loss": 0.0665, "num_tokens": 2139996.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 7261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 134.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.030476661399006844, "kl": 0.0017551222117617726, "learning_rate": 9.130000000000001e-07, "loss": 0.0001, "num_tokens": 2140290.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 134.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.5068140029907227, "kl": 0.12856233259662986, "learning_rate": 9.126666666666667e-07, "loss": 0.0061, "num_tokens": 2140612.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 134.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058793858624994755, "kl": 0.0003757834492716938, "learning_rate": 9.123333333333333e-07, "loss": 0.0, "num_tokens": 2140872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 134.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.046370524913072586, "kl": 0.0031247527804225683, "learning_rate": 9.12e-07, "loss": 0.0002, "num_tokens": 2141138.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 134.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.08554859459400177, "kl": 0.007047143764793873, "learning_rate": 9.116666666666667e-07, "loss": 0.0004, "num_tokens": 2141398.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 134.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.009564424864947796, "kl": 0.0022046566009521484, "learning_rate": 9.113333333333333e-07, "loss": 0.0001, "num_tokens": 2141614.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 134.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04136870428919792, "kl": 0.013337010983377695, "learning_rate": 9.109999999999999e-07, "loss": 0.0007, "num_tokens": 2141888.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 134.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.23459157347679138, "kl": 0.09211067855358124, "learning_rate": 9.106666666666667e-07, "loss": 0.0047, "num_tokens": 2142263.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 134.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.816853940486908, "kl": 0.21860068291425705, "learning_rate": 9.103333333333334e-07, "loss": 0.0102, "num_tokens": 2142610.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 134.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05367870628833771, "kl": 0.013943172059953213, "learning_rate": 9.100000000000001e-07, "loss": 0.0007, "num_tokens": 2142924.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 134.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.054747845977544785, "kl": 0.0009156376127066324, "learning_rate": 9.096666666666668e-07, "loss": 0.0, "num_tokens": 2143137.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 134.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.14112626016139984, "kl": 0.018187306355684996, "learning_rate": 9.093333333333333e-07, "loss": 0.0009, "num_tokens": 2143411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 134.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07531093060970306, "kl": 0.02200077986344695, "learning_rate": 9.09e-07, "loss": 0.0011, "num_tokens": 2143681.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 134.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014039292931556702, "kl": 0.00014556049791281112, "learning_rate": 9.086666666666667e-07, "loss": 0.0, "num_tokens": 2143937.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 134.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.003431979101151228, "kl": 0.00029359757900238037, "learning_rate": 9.083333333333332e-07, "loss": 0.0, "num_tokens": 2144197.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 134.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.004764489829540253, "kl": 0.00020127611060161144, "learning_rate": 9.079999999999999e-07, "loss": 0.0, "num_tokens": 2144513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 134.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.036810729652643204, "kl": 0.040626462548971176, "learning_rate": 9.076666666666667e-07, "loss": 0.002, "num_tokens": 2144918.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 134.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 4.5539378334069625e-05, "kl": 2.5853514671325684e-06, "learning_rate": 9.073333333333334e-07, "loss": 0.0, "num_tokens": 2145138.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 134.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.22406013309955597, "kl": 0.018947109580039978, "learning_rate": 9.070000000000001e-07, "loss": 0.001, "num_tokens": 2145436.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 134.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.05794462189078331, "kl": 0.006478884955868125, "learning_rate": 9.066666666666667e-07, "loss": 0.0003, "num_tokens": 2145696.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 134.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09144249558448792, "kl": 0.044348349794745445, "learning_rate": 9.063333333333333e-07, "loss": 0.0023, "num_tokens": 2145987.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 134.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.018856236711144447, "kl": 0.012103037908673286, "learning_rate": 9.06e-07, "loss": 0.0006, "num_tokens": 2146247.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 134.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.025597816333174706, "kl": 0.000511661171913147, "learning_rate": 9.056666666666667e-07, "loss": 0.0, "num_tokens": 2146457.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 134.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.13349851965904236, "kl": 0.0038930486189201474, "learning_rate": 9.053333333333333e-07, "loss": 0.0002, "num_tokens": 2146725.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 134.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.017097294330596924, "kl": 0.004612116841599345, "learning_rate": 9.050000000000001e-07, "loss": 0.0002, "num_tokens": 2147015.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 134.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03454576060175896, "kl": 0.002968351007439196, "learning_rate": 9.046666666666668e-07, "loss": 0.0001, "num_tokens": 2147317.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 134.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008546649478375912, "kl": 0.0037566646933555603, "learning_rate": 9.043333333333334e-07, "loss": 0.0002, "num_tokens": 2147553.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 134.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.6142919063568115, "kl": 0.09330939501523972, "learning_rate": 9.04e-07, "loss": 0.103, "num_tokens": 2147897.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 135.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.05476091057062149, "kl": 0.022868791595101357, "learning_rate": 9.036666666666667e-07, "loss": 0.0011, "num_tokens": 2148196.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 135.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 9.067389488220215, "kl": 0.02682997426018119, "learning_rate": 9.033333333333333e-07, "loss": 0.2163, "num_tokens": 2148469.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 135.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.8548285961151123, "kl": 0.11683328449726105, "learning_rate": 9.03e-07, "loss": 0.0052, "num_tokens": 2148841.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 7292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 135.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.28243163228034973, "kl": 0.06176641955971718, "learning_rate": 9.026666666666666e-07, "loss": 0.003, "num_tokens": 2149162.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 135.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.1634560376405716, "kl": 0.016710239462554455, "learning_rate": 9.023333333333333e-07, "loss": 0.0009, "num_tokens": 2149438.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 135.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.016343019902706146, "kl": 0.00015169084508670494, "learning_rate": 9.020000000000001e-07, "loss": 0.0, "num_tokens": 2149694.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 135.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06792984157800674, "kl": 0.004617521073669195, "learning_rate": 9.016666666666668e-07, "loss": 0.0002, "num_tokens": 2149971.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 135.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.10269705206155777, "kl": 0.01501919748261571, "learning_rate": 9.013333333333334e-07, "loss": 0.0008, "num_tokens": 2150313.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 135.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.11704476922750473, "kl": 0.042291984893381596, "learning_rate": 9.01e-07, "loss": 0.0017, "num_tokens": 2150683.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 135.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.1805291175842285, "kl": 0.10104147903621197, "learning_rate": 9.006666666666667e-07, "loss": -0.0512, "num_tokens": 2150981.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 7299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 135.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.023492639884352684, "kl": 0.001130125325289555, "learning_rate": 9.003333333333334e-07, "loss": 0.0001, "num_tokens": 2151216.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 135.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02734074741601944, "kl": 0.006692580878734589, "learning_rate": 9e-07, "loss": 0.0004, "num_tokens": 2151486.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.00541269313544035, "kl": 0.00010779500007629395, "learning_rate": 8.996666666666666e-07, "loss": 0.0, "num_tokens": 2151698.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 135.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.02181081846356392, "kl": 0.0005224151827860624, "learning_rate": 8.993333333333333e-07, "loss": 0.0, "num_tokens": 2152014.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.02816421538591385, "kl": 0.0009589850669726729, "learning_rate": 8.990000000000001e-07, "loss": 0.0, "num_tokens": 2152233.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7304 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0059523810632526875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 135.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.6694557666778564, "kl": 0.16646184399724007, "learning_rate": 8.986666666666668e-07, "loss": -0.0705, "num_tokens": 2152594.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 7305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 3.008952808158938e-05, "kl": 2.205371856689453e-06, "learning_rate": 8.983333333333333e-07, "loss": 0.0, "num_tokens": 2152814.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 135.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05522434040904045, "kl": 0.25461290776729584, "learning_rate": 8.98e-07, "loss": 0.0127, "num_tokens": 2153119.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 135.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03750606253743172, "kl": 0.00523244240321219, "learning_rate": 8.976666666666667e-07, "loss": 0.0003, "num_tokens": 2153387.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 135.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.013643357902765274, "kl": 0.0016249477630481124, "learning_rate": 8.973333333333334e-07, "loss": 0.0001, "num_tokens": 2153669.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 135.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.06837660074234009, "kl": 0.009281745064072311, "learning_rate": 8.969999999999999e-07, "loss": 0.0005, "num_tokens": 2153960.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 135.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06455466151237488, "kl": 0.014946409035474062, "learning_rate": 8.966666666666666e-07, "loss": 0.0007, "num_tokens": 2154282.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 135.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.06256791949272156, "kl": 0.010306134354323149, "learning_rate": 8.963333333333333e-07, "loss": 0.0005, "num_tokens": 2154596.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 135.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.4787709712982178, "kl": 0.06068704463541508, "learning_rate": 8.960000000000001e-07, "loss": 0.1584, "num_tokens": 2154950.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 7313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 135.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.06644254922866821, "kl": 0.013334487099200487, "learning_rate": 8.956666666666668e-07, "loss": 0.0007, "num_tokens": 2155228.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 135.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.7882707118988037, "kl": 0.4084884971380234, "learning_rate": 8.953333333333334e-07, "loss": -0.019, "num_tokens": 2155607.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 135.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05079388618469238, "kl": 0.0033153147669509053, "learning_rate": 8.95e-07, "loss": 0.0002, "num_tokens": 2155875.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 135.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05334262549877167, "kl": 0.009435899555683136, "learning_rate": 8.946666666666667e-07, "loss": 0.0004, "num_tokens": 2156167.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 135.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.543372869491577, "kl": 0.1856471374630928, "learning_rate": 8.943333333333334e-07, "loss": 0.0529, "num_tokens": 2156480.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 135.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.043067336082458496, "kl": 0.016864736913703382, "learning_rate": 8.939999999999999e-07, "loss": 0.0009, "num_tokens": 2156766.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 135.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.10759197920560837, "kl": 0.04988580569624901, "learning_rate": 8.936666666666666e-07, "loss": 0.0024, "num_tokens": 2157129.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 135.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.029751647263765335, "kl": 0.005605928134173155, "learning_rate": 8.933333333333333e-07, "loss": 0.0003, "num_tokens": 2157403.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.1020454540848732, "kl": 0.010115991695784032, "learning_rate": 8.930000000000001e-07, "loss": 0.0003, "num_tokens": 2157657.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 135.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06299417465925217, "kl": 0.010090203722938895, "learning_rate": 8.926666666666667e-07, "loss": 0.0005, "num_tokens": 2157984.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 135.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.022969236597418785, "kl": 0.004164924961514771, "learning_rate": 8.923333333333334e-07, "loss": 0.0002, "num_tokens": 2158290.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 135.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.030619587749242783, "kl": 0.0006797239184379578, "learning_rate": 8.92e-07, "loss": 0.0, "num_tokens": 2158496.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 135.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.761586904525757, "kl": 0.08104568719863892, "learning_rate": 8.916666666666667e-07, "loss": 0.0837, "num_tokens": 2158796.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 7326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 10.748947143554688, "kl": 0.049962278455495834, "learning_rate": 8.913333333333333e-07, "loss": -0.1669, "num_tokens": 2159036.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 7327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 135.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06620163470506668, "kl": 0.011506594251841307, "learning_rate": 8.91e-07, "loss": 0.0006, "num_tokens": 2159362.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 135.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01109394896775484, "kl": 0.0008406780543737113, "learning_rate": 8.906666666666666e-07, "loss": 0.0, "num_tokens": 2159636.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 135.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04895327240228653, "kl": 0.006518984911963344, "learning_rate": 8.903333333333333e-07, "loss": 0.0003, "num_tokens": 2159926.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 135.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.219571352005005, "kl": 0.1056404709815979, "learning_rate": 8.900000000000001e-07, "loss": 0.0334, "num_tokens": 2160265.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 135.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.037076301872730255, "kl": 0.010937594808638096, "learning_rate": 8.896666666666667e-07, "loss": 0.0005, "num_tokens": 2160587.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 135.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.026273062452673912, "kl": 0.03700084798038006, "learning_rate": 8.893333333333334e-07, "loss": 0.0019, "num_tokens": 2161000.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 135.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04364119842648506, "kl": 0.007080773822963238, "learning_rate": 8.890000000000001e-07, "loss": 0.0004, "num_tokens": 2161268.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 135.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.26789894700050354, "kl": 0.04030096344649792, "learning_rate": 8.886666666666667e-07, "loss": 0.0019, "num_tokens": 2161559.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 135.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.008967450819909573, "kl": 0.0005120581045048311, "learning_rate": 8.883333333333333e-07, "loss": 0.0, "num_tokens": 2161878.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008881927351467311, "kl": 0.0037580057978630066, "learning_rate": 8.88e-07, "loss": 0.0002, "num_tokens": 2162114.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 135.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0976402536034584, "kl": 0.03780166991055012, "learning_rate": 8.876666666666666e-07, "loss": 0.0019, "num_tokens": 2162416.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 135.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.014479395933449268, "kl": 0.0005550645291805267, "learning_rate": 8.873333333333333e-07, "loss": 0.0, "num_tokens": 2162676.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 135.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.024733468890190125, "kl": 0.001297086477279663, "learning_rate": 8.87e-07, "loss": 0.0001, "num_tokens": 2162888.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 135.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.019985897466540337, "kl": 0.011954582296311855, "learning_rate": 8.866666666666667e-07, "loss": 0.0006, "num_tokens": 2163148.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 135.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.002589184558019042, "kl": 0.0002625075576361269, "learning_rate": 8.863333333333334e-07, "loss": 0.0, "num_tokens": 2163410.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008975645527243614, "kl": 0.001971140503883362, "learning_rate": 8.860000000000001e-07, "loss": 0.0001, "num_tokens": 2163626.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 136.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09261632710695267, "kl": 0.024340140633285046, "learning_rate": 8.856666666666666e-07, "loss": 0.0012, "num_tokens": 2163900.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 136.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.006089119706302881, "kl": 0.00023233643150888383, "learning_rate": 8.853333333333333e-07, "loss": 0.0, "num_tokens": 2164172.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03549829497933388, "kl": 0.0017097890377044678, "learning_rate": 8.85e-07, "loss": 0.0001, "num_tokens": 2164470.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.004759668372571468, "kl": 5.406886339187622e-05, "learning_rate": 8.846666666666667e-07, "loss": 0.0, "num_tokens": 2164682.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04371841624379158, "kl": 0.002771449158899486, "learning_rate": 8.843333333333335e-07, "loss": 0.0001, "num_tokens": 2164936.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 136.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.09800943732261658, "kl": 0.017581870779395103, "learning_rate": 8.840000000000001e-07, "loss": 0.0009, "num_tokens": 2165216.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 136.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.9392058253288269, "kl": 0.1343661308346782, "learning_rate": 8.836666666666667e-07, "loss": 0.0073, "num_tokens": 2165519.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 136.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.05032603442668915, "kl": 0.003250275389291346, "learning_rate": 8.833333333333334e-07, "loss": 0.0002, "num_tokens": 2165787.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 3.7639427318936214e-05, "kl": 2.6226043701171875e-06, "learning_rate": 8.830000000000001e-07, "loss": 0.0, "num_tokens": 2166007.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 136.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.272624492645264, "kl": 0.07689910009503365, "learning_rate": 8.826666666666666e-07, "loss": -0.1998, "num_tokens": 2166319.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.004807692486792803, "clip_ratio/region_mean": 0.004807692486792803, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 136.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.5374581813812256, "kl": 0.026032373309135437, "learning_rate": 8.823333333333333e-07, "loss": -0.0007, "num_tokens": 2166684.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 7354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 136.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.366523027420044, "kl": 0.01075270283035934, "learning_rate": 8.82e-07, "loss": -0.0768, "num_tokens": 2167019.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 136.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 1.4346952438354492, "kl": 0.5086937826126814, "learning_rate": 8.816666666666667e-07, "loss": 0.0659, "num_tokens": 2167309.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 136.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06944682449102402, "kl": 0.008605902519775555, "learning_rate": 8.813333333333334e-07, "loss": 0.0004, "num_tokens": 2167632.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 136.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015439038397744298, "kl": 0.00028729066252708435, "learning_rate": 8.810000000000001e-07, "loss": 0.0, "num_tokens": 2167892.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 136.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05211462453007698, "kl": 0.011499294079840183, "learning_rate": 8.806666666666667e-07, "loss": 0.0006, "num_tokens": 2168220.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 136.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.038138214498758316, "kl": 0.06741504743695259, "learning_rate": 8.803333333333334e-07, "loss": 0.0034, "num_tokens": 2168552.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 136.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.326338529586792, "kl": 0.07935124635696411, "learning_rate": 8.8e-07, "loss": 0.0111, "num_tokens": 2168929.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 136.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.016121793538331985, "kl": 0.0006826934622949921, "learning_rate": 8.796666666666667e-07, "loss": 0.0, "num_tokens": 2169191.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 136.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.003704465925693512, "kl": 0.00028383731842041016, "learning_rate": 8.793333333333333e-07, "loss": 0.0, "num_tokens": 2169411.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.209465980529785, "kl": 0.03110265452414751, "learning_rate": 8.79e-07, "loss": 0.3676, "num_tokens": 2169743.0, "reward": 7.800000190734863, "reward_std": 0.40000009536743164, "rewards/reward_combined/mean": 7.800000190734863, "rewards/reward_combined/std": 0.40000009536743164, "step": 7364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 136.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.920591354370117, "kl": 0.02079156506806612, "learning_rate": 8.786666666666666e-07, "loss": 0.0745, "num_tokens": 2170074.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 7365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 136.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 15.74850845336914, "kl": 0.043286800384521484, "learning_rate": 8.783333333333334e-07, "loss": -0.0034, "num_tokens": 2170280.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 7366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 136.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.015274131670594215, "kl": 0.025646859779953957, "learning_rate": 8.780000000000001e-07, "loss": 0.0013, "num_tokens": 2170634.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 136.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.006005997769534588, "kl": 0.00012720064114546403, "learning_rate": 8.776666666666667e-07, "loss": 0.0, "num_tokens": 2170891.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 136.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.020352181047201157, "kl": 0.09921921417117119, "learning_rate": 8.773333333333334e-07, "loss": 0.005, "num_tokens": 2171263.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 136.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04101250320672989, "kl": 0.002260155975818634, "learning_rate": 8.77e-07, "loss": 0.0001, "num_tokens": 2171531.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 136.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.030056556686758995, "kl": 0.0007235308876261115, "learning_rate": 8.766666666666667e-07, "loss": 0.0, "num_tokens": 2171747.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.645255088806152, "kl": 0.1835203063674271, "learning_rate": 8.763333333333333e-07, "loss": 0.0001, "num_tokens": 2172019.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 136.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009333707392215729, "kl": 0.0005047744052717462, "learning_rate": 8.76e-07, "loss": 0.0, "num_tokens": 2172337.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0256818775087595, "kl": 0.0023071628529578447, "learning_rate": 8.756666666666666e-07, "loss": 0.0001, "num_tokens": 2172614.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 136.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025416959542781115, "kl": 0.00013458231842378154, "learning_rate": 8.753333333333334e-07, "loss": 0.0, "num_tokens": 2172926.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 136.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03927861899137497, "kl": 0.0072903805412352085, "learning_rate": 8.750000000000001e-07, "loss": 0.0004, "num_tokens": 2173214.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00921502336859703, "kl": 0.0003606189420679584, "learning_rate": 8.746666666666668e-07, "loss": 0.0, "num_tokens": 2173482.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.008314108476042747, "kl": 0.0016662552952766418, "learning_rate": 8.743333333333333e-07, "loss": 0.0001, "num_tokens": 2173698.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.028388969600200653, "kl": 0.0032668503699824214, "learning_rate": 8.74e-07, "loss": 0.0002, "num_tokens": 2174018.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 136.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.012800322845578194, "kl": 0.15921640396118164, "learning_rate": 8.736666666666667e-07, "loss": 0.0079, "num_tokens": 2174328.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 136.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.047093044966459274, "kl": 0.001638702000491321, "learning_rate": 8.733333333333333e-07, "loss": 0.0001, "num_tokens": 2174561.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 136.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005203679669648409, "kl": 0.0008633090765215456, "learning_rate": 8.729999999999999e-07, "loss": 0.0, "num_tokens": 2174845.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 136.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03214145451784134, "kl": 0.006641737651079893, "learning_rate": 8.726666666666666e-07, "loss": 0.0003, "num_tokens": 2175135.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 136.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.6138439178466797, "kl": 0.013574820943176746, "learning_rate": 8.723333333333334e-07, "loss": -0.0353, "num_tokens": 2175432.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 136.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 2.073247194290161, "kl": 0.1852499432861805, "learning_rate": 8.720000000000001e-07, "loss": -0.0173, "num_tokens": 2175812.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 7385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 136.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01346014067530632, "kl": 0.0003483604086795822, "learning_rate": 8.716666666666668e-07, "loss": 0.0, "num_tokens": 2176055.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.000995326554402709, "kl": 0.003733530640602112, "learning_rate": 8.713333333333333e-07, "loss": 0.0002, "num_tokens": 2176291.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 136.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.011303563602268696, "kl": 0.007572616450488567, "learning_rate": 8.71e-07, "loss": 0.0004, "num_tokens": 2176563.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 136.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.698444366455078, "kl": 0.1807866357266903, "learning_rate": 8.706666666666667e-07, "loss": 0.0232, "num_tokens": 2176868.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 136.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04582621529698372, "kl": 0.003882407210767269, "learning_rate": 8.703333333333334e-07, "loss": 0.0002, "num_tokens": 2177180.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 136.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.020384741947054863, "kl": 0.004229044541716576, "learning_rate": 8.699999999999999e-07, "loss": 0.0002, "num_tokens": 2177436.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 136.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.7010905742645264, "kl": 0.11481644958257675, "learning_rate": 8.696666666666666e-07, "loss": 0.0059, "num_tokens": 2177814.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 7392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 136.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0655355155467987, "kl": 0.24870573729276657, "learning_rate": 8.693333333333334e-07, "loss": 0.0124, "num_tokens": 2178122.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 136.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.18719778954982758, "kl": 0.03866432886570692, "learning_rate": 8.690000000000001e-07, "loss": 0.0019, "num_tokens": 2178426.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 136.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0578639917075634, "kl": 0.04248492605984211, "learning_rate": 8.686666666666667e-07, "loss": 0.0021, "num_tokens": 2178830.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 136.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06568656861782074, "kl": 0.010178704280406237, "learning_rate": 8.683333333333334e-07, "loss": 0.0005, "num_tokens": 2179126.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 136.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.11169084161520004, "kl": 0.0135839837603271, "learning_rate": 8.68e-07, "loss": 0.0007, "num_tokens": 2179402.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 137.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.022460181266069412, "kl": 0.011400938034057617, "learning_rate": 8.676666666666667e-07, "loss": 0.0006, "num_tokens": 2179662.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7398 }, { "clip_ratio/high_max": 0.006666666828095913, "clip_ratio/high_mean": 0.006666666828095913, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006666666828095913, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 137.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.146008014678955, "kl": 0.2829881012439728, "learning_rate": 8.673333333333333e-07, "loss": -0.0069, "num_tokens": 2180036.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 137.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.028076032176613808, "kl": 0.0056443470530211926, "learning_rate": 8.669999999999999e-07, "loss": 0.0003, "num_tokens": 2180304.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 137.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.049363575875759125, "kl": 0.008579898159950972, "learning_rate": 8.666666666666666e-07, "loss": 0.0004, "num_tokens": 2180621.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 137.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05836309492588043, "kl": 0.002380305028054863, "learning_rate": 8.663333333333334e-07, "loss": 0.0001, "num_tokens": 2180889.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 137.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 5.664955139160156, "kl": 0.09833686100319028, "learning_rate": 8.66e-07, "loss": 0.1315, "num_tokens": 2181170.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 7403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 137.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 11.70896053314209, "kl": 0.5666388068348169, "learning_rate": 8.656666666666667e-07, "loss": -0.0912, "num_tokens": 2181387.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 7404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 137.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02353428490459919, "kl": 0.0005417026113718748, "learning_rate": 8.653333333333334e-07, "loss": 0.0, "num_tokens": 2181621.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 137.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.022645464166998863, "kl": 0.001115315710194409, "learning_rate": 8.65e-07, "loss": 0.0001, "num_tokens": 2181909.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 137.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.814314842224121, "kl": 0.23310255235992372, "learning_rate": 8.646666666666667e-07, "loss": 0.0369, "num_tokens": 2182235.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 137.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.032141633331775665, "kl": 0.0005283355712890625, "learning_rate": 8.643333333333333e-07, "loss": 0.0, "num_tokens": 2182455.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 137.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06343802064657211, "kl": 0.004680798389017582, "learning_rate": 8.64e-07, "loss": 0.0003, "num_tokens": 2182686.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 137.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 10.0105619430542, "kl": 0.1006287969648838, "learning_rate": 8.636666666666668e-07, "loss": 0.0639, "num_tokens": 2182963.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7410 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 137.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 5.662911891937256, "kl": 0.06964758038520813, "learning_rate": 8.633333333333335e-07, "loss": 0.2889, "num_tokens": 2183283.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 137.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.08836774528026581, "kl": 0.006171148270368576, "learning_rate": 8.63e-07, "loss": 0.0003, "num_tokens": 2183551.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 137.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.435682773590088, "kl": 0.022472238168120384, "learning_rate": 8.626666666666667e-07, "loss": -0.001, "num_tokens": 2183866.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 137.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.042500365525484085, "kl": 0.0037860737647861242, "learning_rate": 8.623333333333334e-07, "loss": 0.0002, "num_tokens": 2184160.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 137.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 1.332398533821106, "kl": 0.13493806798942387, "learning_rate": 8.62e-07, "loss": 0.0078, "num_tokens": 2184460.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 137.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.030837273225188255, "kl": 0.0015538225416094065, "learning_rate": 8.616666666666666e-07, "loss": 0.0001, "num_tokens": 2184787.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 137.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08426261693239212, "kl": 0.011116457171738148, "learning_rate": 8.613333333333333e-07, "loss": 0.0006, "num_tokens": 2185118.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 137.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.9193925857543945, "kl": 0.24017392098903656, "learning_rate": 8.61e-07, "loss": 0.0303, "num_tokens": 2185423.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 137.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006789442617446184, "kl": 0.16325239092111588, "learning_rate": 8.606666666666668e-07, "loss": 0.0082, "num_tokens": 2185731.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 137.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.006305330898612738, "kl": 0.0008604274480603635, "learning_rate": 8.603333333333335e-07, "loss": 0.0, "num_tokens": 2186015.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 137.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.0816707611083984, "kl": 0.06952119991183281, "learning_rate": 8.6e-07, "loss": 0.0824, "num_tokens": 2186366.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 7421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 137.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.1182108223438263, "kl": 0.039948973804712296, "learning_rate": 8.596666666666667e-07, "loss": 0.002, "num_tokens": 2186712.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 137.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01411399058997631, "kl": 0.0006649158895015717, "learning_rate": 8.593333333333334e-07, "loss": 0.0, "num_tokens": 2186972.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 137.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04029836878180504, "kl": 0.007934166118502617, "learning_rate": 8.590000000000001e-07, "loss": 0.0004, "num_tokens": 2187319.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 137.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.01258003432303667, "kl": 0.00032490704325027764, "learning_rate": 8.586666666666666e-07, "loss": 0.0, "num_tokens": 2187562.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 137.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03614702448248863, "kl": 0.0005190700394450687, "learning_rate": 8.583333333333333e-07, "loss": 0.0, "num_tokens": 2187818.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 137.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.8715689182281494, "kl": 0.1937999464571476, "learning_rate": 8.58e-07, "loss": 0.1032, "num_tokens": 2188100.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 137.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.05860782414674759, "kl": 0.007456639315932989, "learning_rate": 8.576666666666668e-07, "loss": 0.0004, "num_tokens": 2188360.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 137.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.20272482931613922, "kl": 0.026359963230788708, "learning_rate": 8.573333333333334e-07, "loss": 0.0014, "num_tokens": 2188663.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 137.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009349181782454252, "kl": 0.001314584689680487, "learning_rate": 8.57e-07, "loss": 0.0001, "num_tokens": 2188943.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 137.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.7454628944396973, "kl": 0.3071683496236801, "learning_rate": 8.566666666666667e-07, "loss": 0.0136, "num_tokens": 2189239.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 137.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.004477109760046005, "kl": 0.0004341356980148703, "learning_rate": 8.563333333333334e-07, "loss": 0.0, "num_tokens": 2189559.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 137.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09784694015979767, "kl": 0.025170376524329185, "learning_rate": 8.56e-07, "loss": 0.0014, "num_tokens": 2189824.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 137.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.004328091163188219, "kl": 0.00010947883129119873, "learning_rate": 8.556666666666666e-07, "loss": 0.0, "num_tokens": 2190036.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 137.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06888636946678162, "kl": 0.003335043787956238, "learning_rate": 8.553333333333333e-07, "loss": 0.0002, "num_tokens": 2190250.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 137.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0348845012485981, "kl": 0.029288400895893574, "learning_rate": 8.55e-07, "loss": 0.0015, "num_tokens": 2190602.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 137.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 9.342391014099121, "kl": 0.22621587512549013, "learning_rate": 8.546666666666667e-07, "loss": 0.0816, "num_tokens": 2190865.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 7437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 137.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.039388205856084824, "kl": 0.006763803539797664, "learning_rate": 8.543333333333334e-07, "loss": 0.0003, "num_tokens": 2191138.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 137.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.02858257293701172, "kl": 0.00419685392989777, "learning_rate": 8.540000000000001e-07, "loss": 0.0002, "num_tokens": 2191406.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 137.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.018522974103689194, "kl": 0.000549808144569397, "learning_rate": 8.536666666666667e-07, "loss": 0.0, "num_tokens": 2191618.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 137.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.022440942004323006, "kl": 0.09833554178476334, "learning_rate": 8.533333333333334e-07, "loss": 0.0049, "num_tokens": 2191991.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 137.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04527389258146286, "kl": 0.003962432034313679, "learning_rate": 8.53e-07, "loss": 0.0002, "num_tokens": 2192305.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 137.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.04913357272744179, "kl": 0.04473540745675564, "learning_rate": 8.526666666666666e-07, "loss": 0.0022, "num_tokens": 2192709.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 137.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008490979089401662, "kl": 0.0037662237882614136, "learning_rate": 8.523333333333333e-07, "loss": 0.0002, "num_tokens": 2192945.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 137.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 6.828980922698975, "kl": 0.041903593111783266, "learning_rate": 8.52e-07, "loss": 0.0698, "num_tokens": 2193251.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 137.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.9139034748077393, "kl": 0.16533025354146957, "learning_rate": 8.516666666666667e-07, "loss": 0.0101, "num_tokens": 2193617.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 7446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 137.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.08962789922952652, "kl": 0.007310988090466708, "learning_rate": 8.513333333333334e-07, "loss": 0.0004, "num_tokens": 2193895.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 137.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04856106638908386, "kl": 0.013421293813735247, "learning_rate": 8.510000000000001e-07, "loss": 0.0007, "num_tokens": 2194197.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 137.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 7.592230319976807, "kl": 1.1883927583694458, "learning_rate": 8.506666666666667e-07, "loss": 0.1217, "num_tokens": 2194498.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 7449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 137.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.023714344948530197, "kl": 0.0006568315147887915, "learning_rate": 8.503333333333333e-07, "loss": 0.0, "num_tokens": 2194762.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 137.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.11396137624979019, "kl": 0.01255968026816845, "learning_rate": 8.5e-07, "loss": 0.0007, "num_tokens": 2195050.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 138.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.006025221664458513, "kl": 0.0002475397413945757, "learning_rate": 8.496666666666667e-07, "loss": 0.0, "num_tokens": 2195359.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 138.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04535014182329178, "kl": 0.002231706981547177, "learning_rate": 8.493333333333333e-07, "loss": 0.0001, "num_tokens": 2195630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 138.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.029302461072802544, "kl": 0.028821095824241638, "learning_rate": 8.489999999999999e-07, "loss": 0.0014, "num_tokens": 2195998.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 138.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009031427907757461, "kl": 0.001305686600971967, "learning_rate": 8.486666666666667e-07, "loss": 0.0001, "num_tokens": 2196278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 138.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05860074982047081, "kl": 0.011074992828071117, "learning_rate": 8.483333333333334e-07, "loss": 0.0005, "num_tokens": 2196580.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 138.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.004128076136112213, "kl": 0.000924179214052856, "learning_rate": 8.480000000000001e-07, "loss": 0.0, "num_tokens": 2196864.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 138.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.072437286376953, "kl": 0.4014170467853546, "learning_rate": 8.476666666666668e-07, "loss": 0.0488, "num_tokens": 2197170.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 138.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.14057204127311707, "kl": 0.021488927770406008, "learning_rate": 8.473333333333333e-07, "loss": 0.0011, "num_tokens": 2197468.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 138.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04771888256072998, "kl": 0.007995693013072014, "learning_rate": 8.47e-07, "loss": 0.0004, "num_tokens": 2197752.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 138.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0066382321529090405, "kl": 0.00027922044682782143, "learning_rate": 8.466666666666667e-07, "loss": 0.0, "num_tokens": 2198024.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 138.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036018560640513897, "kl": 0.0002032928168773651, "learning_rate": 8.463333333333332e-07, "loss": 0.0, "num_tokens": 2198268.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 138.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02132377214729786, "kl": 0.0009703243558760732, "learning_rate": 8.459999999999999e-07, "loss": 0.0001, "num_tokens": 2198558.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 138.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.007851053029298782, "kl": 0.0007555115735158324, "learning_rate": 8.456666666666667e-07, "loss": 0.0, "num_tokens": 2198836.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 138.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.859008312225342, "kl": 0.1322932867333293, "learning_rate": 8.453333333333334e-07, "loss": -0.0719, "num_tokens": 2199154.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.06922364234924316, "kl": 0.0015659108757972717, "learning_rate": 8.450000000000001e-07, "loss": 0.0001, "num_tokens": 2199374.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 138.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.385145902633667, "kl": 0.07519279047846794, "learning_rate": 8.446666666666667e-07, "loss": 0.1105, "num_tokens": 2199713.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 7467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 138.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.14342519640922546, "kl": 0.019923364743590355, "learning_rate": 8.443333333333333e-07, "loss": 0.001, "num_tokens": 2199982.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 138.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01938001625239849, "kl": 0.0010086982365464792, "learning_rate": 8.44e-07, "loss": 0.0001, "num_tokens": 2200280.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 138.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.7041492462158203, "kl": 0.18741435185074806, "learning_rate": 8.436666666666667e-07, "loss": 0.0616, "num_tokens": 2200603.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 138.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.003147014183923602, "kl": 0.0002091715796268545, "learning_rate": 8.433333333333333e-07, "loss": 0.0, "num_tokens": 2200865.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 138.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.2989468574523926, "kl": 0.046004125848412514, "learning_rate": 8.430000000000001e-07, "loss": 0.0613, "num_tokens": 2201149.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 138.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 19.241243362426758, "kl": 0.03809886300587095, "learning_rate": 8.426666666666668e-07, "loss": 0.3625, "num_tokens": 2201373.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 7473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 138.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 4.816703796386719, "kl": 0.031998677644878626, "learning_rate": 8.423333333333334e-07, "loss": 0.0026, "num_tokens": 2201649.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 138.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.004953030962496996, "kl": 0.0004164865240454674, "learning_rate": 8.42e-07, "loss": 0.0, "num_tokens": 2201971.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 6.697404861450195, "kl": 0.006802628748118877, "learning_rate": 8.416666666666667e-07, "loss": 0.4424, "num_tokens": 2202244.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 138.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.037498679012060165, "kl": 0.006209843559190631, "learning_rate": 8.413333333333333e-07, "loss": 0.0003, "num_tokens": 2202582.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007692615035921335, "kl": 0.003784686326980591, "learning_rate": 8.41e-07, "loss": 0.0002, "num_tokens": 2202818.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 138.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.08540654927492142, "kl": 0.011341342236846685, "learning_rate": 8.406666666666667e-07, "loss": 0.0006, "num_tokens": 2203107.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 138.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.1784486770629883, "kl": 0.07143725268542767, "learning_rate": 8.403333333333333e-07, "loss": -0.03, "num_tokens": 2203484.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 138.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.930631637573242, "kl": 0.027482120785862207, "learning_rate": 8.400000000000001e-07, "loss": 0.0834, "num_tokens": 2203812.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 138.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02112310752272606, "kl": 0.005166828632354736, "learning_rate": 8.396666666666668e-07, "loss": 0.0003, "num_tokens": 2204084.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 138.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.044708251953125, "kl": 0.008042186964303255, "learning_rate": 8.393333333333334e-07, "loss": 0.0004, "num_tokens": 2204402.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 138.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.011828447692096233, "kl": 0.0004938519632560201, "learning_rate": 8.39e-07, "loss": 0.0, "num_tokens": 2204713.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 138.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.0983519554138184, "kl": 0.30614741519093513, "learning_rate": 8.386666666666667e-07, "loss": -0.0641, "num_tokens": 2205029.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 1.9500738382339478, "kl": 0.30372533947229385, "learning_rate": 8.383333333333334e-07, "loss": 0.0152, "num_tokens": 2205283.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 138.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.025342263281345367, "kl": 0.0042840738606173545, "learning_rate": 8.38e-07, "loss": 0.0002, "num_tokens": 2205551.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 138.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0386507622897625, "kl": 0.013293697265908122, "learning_rate": 8.376666666666666e-07, "loss": 0.0007, "num_tokens": 2205825.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 138.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.2004091888666153, "kl": 0.01904579158872366, "learning_rate": 8.373333333333333e-07, "loss": 0.0011, "num_tokens": 2206095.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 138.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.3351521492004395, "kl": 0.08389908447861671, "learning_rate": 8.370000000000001e-07, "loss": -0.0351, "num_tokens": 2206410.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 7490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08179929107427597, "kl": 0.004590657539665699, "learning_rate": 8.366666666666668e-07, "loss": 0.0002, "num_tokens": 2206623.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 138.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06452228128910065, "kl": 0.019506637006998062, "learning_rate": 8.363333333333333e-07, "loss": 0.001, "num_tokens": 2206923.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 138.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0369425043463707, "kl": 0.04019300080835819, "learning_rate": 8.36e-07, "loss": 0.002, "num_tokens": 2207328.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 138.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.011346984654664993, "kl": 0.1592487022280693, "learning_rate": 8.356666666666667e-07, "loss": 0.008, "num_tokens": 2207638.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 138.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.054627083241939545, "kl": 0.009328438900411129, "learning_rate": 8.353333333333334e-07, "loss": 0.0004, "num_tokens": 2207931.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 138.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.005104490090161562, "kl": 0.00032470822043251246, "learning_rate": 8.349999999999999e-07, "loss": 0.0, "num_tokens": 2208151.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 138.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.023571014404296875, "kl": 0.011389652732759714, "learning_rate": 8.346666666666666e-07, "loss": 0.0006, "num_tokens": 2208411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 138.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04267065227031708, "kl": 0.0036842571571469307, "learning_rate": 8.343333333333333e-07, "loss": 0.0002, "num_tokens": 2208723.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 138.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.023598650470376015, "kl": 0.09938794374465942, "learning_rate": 8.340000000000001e-07, "loss": 0.005, "num_tokens": 2209095.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 138.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.014311257749795914, "kl": 0.0009229793504346162, "learning_rate": 8.336666666666668e-07, "loss": 0.0, "num_tokens": 2209330.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 138.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.011054381728172302, "kl": 0.00014528334213537164, "learning_rate": 8.333333333333334e-07, "loss": 0.0, "num_tokens": 2209586.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 138.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.014404000714421272, "kl": 0.002331082767341286, "learning_rate": 8.33e-07, "loss": 0.0001, "num_tokens": 2209918.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030474516097456217, "kl": 3.0584633350372314e-05, "learning_rate": 8.326666666666667e-07, "loss": 0.0, "num_tokens": 2210130.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7503 }, { "clip_ratio/high_max": 0.006756756920367479, "clip_ratio/high_mean": 0.006756756920367479, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006756756920367479, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 138.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.5553717613220215, "kl": 0.061494626104831696, "learning_rate": 8.323333333333334e-07, "loss": -0.1179, "num_tokens": 2210488.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 7504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007042253389954567, "clip_ratio/low_min": 0.007042253389954567, "clip_ratio/region_mean": 0.007042253389954567, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 138.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.1578540802001953, "kl": 0.11879325658082962, "learning_rate": 8.319999999999999e-07, "loss": -0.0319, "num_tokens": 2210829.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 139.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.015871938318014145, "kl": 0.0006293095648288727, "learning_rate": 8.316666666666666e-07, "loss": 0.0, "num_tokens": 2211089.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 139.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.012658831663429737, "kl": 0.0005571305810008198, "learning_rate": 8.313333333333333e-07, "loss": 0.0, "num_tokens": 2211406.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 139.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06424162536859512, "kl": 0.00840391730889678, "learning_rate": 8.310000000000001e-07, "loss": 0.0004, "num_tokens": 2211679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 139.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.026806170120835304, "kl": 0.0011744549265131354, "learning_rate": 8.306666666666667e-07, "loss": 0.0001, "num_tokens": 2211895.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 139.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.1453925520181656, "kl": 0.03526879474520683, "learning_rate": 8.303333333333334e-07, "loss": 0.0014, "num_tokens": 2212212.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 139.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.057810500264167786, "kl": 0.0025528251426294446, "learning_rate": 8.3e-07, "loss": 0.0001, "num_tokens": 2212478.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 139.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.016564732417464256, "kl": 0.1574014574289322, "learning_rate": 8.296666666666667e-07, "loss": 0.0079, "num_tokens": 2212789.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 139.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0217976626008749, "kl": 0.008066430920735002, "learning_rate": 8.293333333333333e-07, "loss": 0.0004, "num_tokens": 2213079.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 139.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.040881071239709854, "kl": 0.011039676610380411, "learning_rate": 8.29e-07, "loss": 0.0006, "num_tokens": 2213422.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 139.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.11250057816505432, "kl": 0.019668866880238056, "learning_rate": 8.286666666666666e-07, "loss": 0.001, "num_tokens": 2213700.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 139.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.021708892658352852, "kl": 0.00024116988060995936, "learning_rate": 8.283333333333333e-07, "loss": 0.0, "num_tokens": 2213957.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 139.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.007992224767804146, "kl": 0.0003065453201998025, "learning_rate": 8.280000000000001e-07, "loss": 0.0, "num_tokens": 2214225.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 139.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0388016402721405, "kl": 0.0016274884110316634, "learning_rate": 8.276666666666667e-07, "loss": 0.0001, "num_tokens": 2214495.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 139.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06862416118383408, "kl": 0.014306346885859966, "learning_rate": 8.273333333333334e-07, "loss": 0.0007, "num_tokens": 2214824.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 139.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.8594709634780884, "kl": 0.11305754183558747, "learning_rate": 8.270000000000001e-07, "loss": 0.0059, "num_tokens": 2215085.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 139.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.897463858127594, "kl": 0.07248111662920564, "learning_rate": 8.266666666666667e-07, "loss": 0.0042, "num_tokens": 2215381.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 139.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07356895506381989, "kl": 0.014403363689780235, "learning_rate": 8.263333333333333e-07, "loss": 0.0007, "num_tokens": 2215663.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 139.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.010495007038116455, "kl": 0.002083301544189453, "learning_rate": 8.26e-07, "loss": 0.0001, "num_tokens": 2215879.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 139.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.00595331285148859, "kl": 0.0004487338737817481, "learning_rate": 8.256666666666666e-07, "loss": 0.0, "num_tokens": 2216198.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 139.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.021645324304699898, "kl": 0.005602358374744654, "learning_rate": 8.253333333333333e-07, "loss": 0.0003, "num_tokens": 2216466.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 139.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017823046073317528, "kl": 3.83034348487854e-05, "learning_rate": 8.25e-07, "loss": 0.0, "num_tokens": 2216678.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 139.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.866249084472656, "kl": 0.054738983511924744, "learning_rate": 8.246666666666667e-07, "loss": 0.1679, "num_tokens": 2216983.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 7527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 139.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 2.607938528060913, "kl": 0.10753445327281952, "learning_rate": 8.243333333333334e-07, "loss": 0.0064, "num_tokens": 2217335.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 7528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 139.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07217083871364594, "kl": 0.0256509892642498, "learning_rate": 8.240000000000001e-07, "loss": 0.0014, "num_tokens": 2217646.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 139.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035971191246062517, "kl": 9.822547508520074e-05, "learning_rate": 8.236666666666666e-07, "loss": 0.0, "num_tokens": 2217906.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 139.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06426176428794861, "kl": 0.012222326826304197, "learning_rate": 8.233333333333333e-07, "loss": 0.0006, "num_tokens": 2218210.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 139.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.294781446456909, "kl": 0.26684870198369026, "learning_rate": 8.23e-07, "loss": 0.0144, "num_tokens": 2218586.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 7532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 139.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.09070924669504166, "kl": 0.004909512703306973, "learning_rate": 8.226666666666667e-07, "loss": 0.0002, "num_tokens": 2218882.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 139.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04229502007365227, "kl": 0.037462251260876656, "learning_rate": 8.223333333333335e-07, "loss": 0.0019, "num_tokens": 2219287.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 139.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.072139263153076, "kl": 0.48522141203284264, "learning_rate": 8.220000000000001e-07, "loss": -0.045, "num_tokens": 2219583.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 139.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.04239751026034355, "kl": 0.009944356512278318, "learning_rate": 8.216666666666667e-07, "loss": 0.0005, "num_tokens": 2219867.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 139.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.610398769378662, "kl": 0.02635159925557673, "learning_rate": 8.213333333333334e-07, "loss": 0.0345, "num_tokens": 2220198.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 139.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.036886066198349, "kl": 0.014997601509094238, "learning_rate": 8.210000000000001e-07, "loss": 0.0007, "num_tokens": 2220494.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 139.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.016432078555226326, "kl": 0.0017473031766712666, "learning_rate": 8.206666666666666e-07, "loss": 0.0001, "num_tokens": 2220771.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 139.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.044504694640636444, "kl": 0.006012620055116713, "learning_rate": 8.203333333333333e-07, "loss": 0.0003, "num_tokens": 2221029.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 139.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.064578056335449, "kl": 0.10834954772144556, "learning_rate": 8.2e-07, "loss": 0.1237, "num_tokens": 2221314.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 7541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 139.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03416510671377182, "kl": 0.000925898551940918, "learning_rate": 8.196666666666667e-07, "loss": 0.0, "num_tokens": 2221524.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 139.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03329979628324509, "kl": 0.0008929375326260924, "learning_rate": 8.193333333333334e-07, "loss": 0.0, "num_tokens": 2221800.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 139.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03190172091126442, "kl": 0.007421887246891856, "learning_rate": 8.190000000000001e-07, "loss": 0.0003, "num_tokens": 2222092.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 139.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.016833335161209106, "kl": 0.00046781414130236953, "learning_rate": 8.186666666666667e-07, "loss": 0.0, "num_tokens": 2222325.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 139.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069707660004496574, "kl": 0.0012512215180322528, "learning_rate": 8.183333333333334e-07, "loss": 0.0001, "num_tokens": 2222607.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 139.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.9212307929992676, "kl": 0.09565270692110062, "learning_rate": 8.18e-07, "loss": 0.0923, "num_tokens": 2222991.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 139.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.004899625200778246, "kl": 0.000315505254548043, "learning_rate": 8.176666666666667e-07, "loss": 0.0, "num_tokens": 2223211.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 139.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034213343169540167, "kl": 0.00029207393527030945, "learning_rate": 8.173333333333333e-07, "loss": 0.0, "num_tokens": 2223471.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 139.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0565950945019722, "kl": 0.04421113058924675, "learning_rate": 8.17e-07, "loss": 0.0022, "num_tokens": 2223799.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 139.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007374148699454963, "kl": 0.0037840083241462708, "learning_rate": 8.166666666666666e-07, "loss": 0.0002, "num_tokens": 2224035.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 139.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 5.976224899291992, "kl": 1.206072598695755, "learning_rate": 8.163333333333334e-07, "loss": 0.0572, "num_tokens": 2224338.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 139.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.043496474623680115, "kl": 0.10060855746269226, "learning_rate": 8.160000000000001e-07, "loss": 0.005, "num_tokens": 2224710.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 139.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.12133745849132538, "kl": 0.013637838419526815, "learning_rate": 8.156666666666667e-07, "loss": 0.0007, "num_tokens": 2225014.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 139.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.34241989254951477, "kl": 0.0259231049567461, "learning_rate": 8.153333333333334e-07, "loss": 0.0015, "num_tokens": 2225261.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 139.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02494633011519909, "kl": 0.010982351377606392, "learning_rate": 8.15e-07, "loss": 0.0005, "num_tokens": 2225521.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 139.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.6630780696868896, "kl": 0.19948132801800966, "learning_rate": 8.146666666666667e-07, "loss": 0.0239, "num_tokens": 2225834.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 7557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 139.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004337610735092312, "kl": 1.1578202247619629e-05, "learning_rate": 8.143333333333333e-07, "loss": 0.0, "num_tokens": 2226054.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 139.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03504602238535881, "kl": 0.005426311166957021, "learning_rate": 8.14e-07, "loss": 0.0003, "num_tokens": 2226386.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 140.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04597458243370056, "kl": 0.04685534071177244, "learning_rate": 8.136666666666666e-07, "loss": 0.0023, "num_tokens": 2226756.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 140.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04793095588684082, "kl": 0.00395410624332726, "learning_rate": 8.133333333333334e-07, "loss": 0.0002, "num_tokens": 2227060.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 140.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02366197481751442, "kl": 0.09934138134121895, "learning_rate": 8.130000000000001e-07, "loss": 0.005, "num_tokens": 2227432.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 140.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.07069484144449234, "kl": 0.04000316001474857, "learning_rate": 8.126666666666668e-07, "loss": 0.002, "num_tokens": 2227771.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 140.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.030317479744553566, "kl": 0.001375596970319748, "learning_rate": 8.123333333333333e-07, "loss": 0.0001, "num_tokens": 2228039.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 140.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04880096763372421, "kl": 0.012494937982410192, "learning_rate": 8.12e-07, "loss": 0.0006, "num_tokens": 2228300.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 140.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.978727340698242, "kl": 0.18742307275533676, "learning_rate": 8.116666666666667e-07, "loss": -0.0235, "num_tokens": 2228608.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 140.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.006790049374103546, "kl": 0.00026611237262841314, "learning_rate": 8.113333333333333e-07, "loss": 0.0, "num_tokens": 2228880.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 140.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.7163357734680176, "kl": 0.18649326637387276, "learning_rate": 8.109999999999999e-07, "loss": 0.0559, "num_tokens": 2229221.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 7568 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 140.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 4.810575008392334, "kl": 0.07260394468903542, "learning_rate": 8.106666666666666e-07, "loss": -0.1203, "num_tokens": 2229546.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 140.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.19880366325378418, "kl": 0.028160014539025724, "learning_rate": 8.103333333333334e-07, "loss": 0.002, "num_tokens": 2229843.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 140.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.24680061638355255, "kl": 0.07979537546634674, "learning_rate": 8.100000000000001e-07, "loss": 0.0037, "num_tokens": 2230159.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 140.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 1.1867517232894897, "kl": 0.11407772451639175, "learning_rate": 8.096666666666668e-07, "loss": 0.006, "num_tokens": 2230420.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 140.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.029743794351816177, "kl": 0.0008981935679912567, "learning_rate": 8.093333333333333e-07, "loss": 0.0, "num_tokens": 2230664.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 140.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.03845464065670967, "kl": 0.0340463537722826, "learning_rate": 8.09e-07, "loss": 0.0018, "num_tokens": 2231034.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 140.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.029919151216745377, "kl": 0.003957441025704611, "learning_rate": 8.086666666666667e-07, "loss": 0.0002, "num_tokens": 2231292.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 140.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.4280803203582764, "kl": 0.47196139581501484, "learning_rate": 8.083333333333334e-07, "loss": 0.0234, "num_tokens": 2231614.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 140.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.046749360859394073, "kl": 0.006110889138653874, "learning_rate": 8.079999999999999e-07, "loss": 0.0003, "num_tokens": 2231945.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 140.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.08680707216262817, "kl": 0.007161391666159034, "learning_rate": 8.076666666666666e-07, "loss": 0.0004, "num_tokens": 2232277.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 140.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.4702816009521484, "kl": 0.5615438558161259, "learning_rate": 8.073333333333334e-07, "loss": -0.0502, "num_tokens": 2232636.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 140.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007369700470007956, "kl": 0.0037825629115104675, "learning_rate": 8.070000000000001e-07, "loss": 0.0002, "num_tokens": 2232872.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 140.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.1824977695941925, "kl": 0.027511563152074814, "learning_rate": 8.066666666666667e-07, "loss": 0.0014, "num_tokens": 2233158.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 140.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03374043479561806, "kl": 0.04851582646369934, "learning_rate": 8.063333333333333e-07, "loss": 0.0024, "num_tokens": 2233562.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 140.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.005906059872359037, "kl": 0.0006824582815170288, "learning_rate": 8.06e-07, "loss": 0.0, "num_tokens": 2233778.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 140.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008622748428024352, "kl": 0.0012761758989654481, "learning_rate": 8.056666666666667e-07, "loss": 0.0001, "num_tokens": 2234058.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 140.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.029156355187296867, "kl": 0.0010121912637259811, "learning_rate": 8.053333333333333e-07, "loss": 0.0001, "num_tokens": 2234388.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 140.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011493833735585213, "kl": 0.002943081548437476, "learning_rate": 8.049999999999999e-07, "loss": 0.0001, "num_tokens": 2234679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 76.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 140.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.269927740097046, "kl": 0.2345767617225647, "learning_rate": 8.046666666666666e-07, "loss": 0.453, "num_tokens": 2235223.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 140.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 1.0193402767181396, "kl": 0.05985639221034944, "learning_rate": 8.043333333333334e-07, "loss": 0.0034, "num_tokens": 2235499.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 140.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337231695652008, "kl": 0.02417577523738146, "learning_rate": 8.04e-07, "loss": 0.0014, "num_tokens": 2235781.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 140.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.053484175354242325, "kl": 0.028386572375893593, "learning_rate": 8.036666666666667e-07, "loss": 0.0013, "num_tokens": 2236083.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 140.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.021800415590405464, "kl": 0.0011058914533350617, "learning_rate": 8.033333333333334e-07, "loss": 0.0001, "num_tokens": 2236359.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 140.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.07727114111185074, "kl": 0.004364959895610809, "learning_rate": 8.03e-07, "loss": 0.0002, "num_tokens": 2236570.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 140.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.18696685135364532, "kl": 0.020725714042782784, "learning_rate": 8.026666666666667e-07, "loss": 0.001, "num_tokens": 2236852.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 140.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.028688820078969002, "kl": 0.0038562872214242816, "learning_rate": 8.023333333333333e-07, "loss": 0.0002, "num_tokens": 2237140.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 140.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.17888464033603668, "kl": 0.008969725575298071, "learning_rate": 8.02e-07, "loss": 0.0003, "num_tokens": 2237388.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 140.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.1041543036699295, "kl": 0.0071145216934382915, "learning_rate": 8.016666666666668e-07, "loss": 0.0005, "num_tokens": 2237615.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 140.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.15359792113304138, "kl": 0.009021283593028784, "learning_rate": 8.013333333333335e-07, "loss": 0.0005, "num_tokens": 2237873.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 140.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.030205674469470978, "kl": 0.0047327810898423195, "learning_rate": 8.01e-07, "loss": 0.0002, "num_tokens": 2238161.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 140.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.029939625412225723, "kl": 0.00027085840702056885, "learning_rate": 8.006666666666667e-07, "loss": 0.0, "num_tokens": 2238373.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 140.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.6898300647735596, "kl": 0.3640999048948288, "learning_rate": 8.003333333333334e-07, "loss": 0.0106, "num_tokens": 2238746.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 7600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 140.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04006919637322426, "kl": 0.008541662245988846, "learning_rate": 8e-07, "loss": 0.0004, "num_tokens": 2239105.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 140.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01660209149122238, "kl": 0.00018564164929557592, "learning_rate": 7.996666666666666e-07, "loss": 0.0, "num_tokens": 2239361.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 140.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.30509015917778015, "kl": 0.04901084862649441, "learning_rate": 7.993333333333333e-07, "loss": 0.0023, "num_tokens": 2239655.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 140.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04473618417978287, "kl": 0.0024844787549227476, "learning_rate": 7.99e-07, "loss": 0.0001, "num_tokens": 2239951.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 140.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.00043820179416798055, "kl": 1.1593103408813477e-05, "learning_rate": 7.986666666666668e-07, "loss": 0.0, "num_tokens": 2240171.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 140.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.17294429242610931, "kl": 0.033557578921318054, "learning_rate": 7.983333333333335e-07, "loss": 0.0018, "num_tokens": 2240482.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 140.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.014695012010633945, "kl": 0.0003977760788984597, "learning_rate": 7.98e-07, "loss": 0.0, "num_tokens": 2240798.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 140.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.15435448288917542, "kl": 0.025543692521750927, "learning_rate": 7.976666666666667e-07, "loss": 0.0013, "num_tokens": 2241069.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 140.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.004698130302131176, "kl": 0.00029430389986373484, "learning_rate": 7.973333333333334e-07, "loss": 0.0, "num_tokens": 2241289.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 140.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06369496881961823, "kl": 0.008858399465680122, "learning_rate": 7.970000000000001e-07, "loss": 0.0004, "num_tokens": 2241601.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 140.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.011792807839810848, "kl": 0.0022514045267598704, "learning_rate": 7.966666666666666e-07, "loss": 0.0001, "num_tokens": 2241867.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 140.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02483171783387661, "kl": 0.006977959303185344, "learning_rate": 7.963333333333333e-07, "loss": 0.0003, "num_tokens": 2242156.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 140.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 11.277273178100586, "kl": 0.03083835239522159, "learning_rate": 7.96e-07, "loss": 0.0367, "num_tokens": 2242390.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 7613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04749931022524834, "kl": 0.02056124061346054, "learning_rate": 7.956666666666668e-07, "loss": 0.0011, "num_tokens": 2242670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 141.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.018423404544591904, "kl": 0.0005905196012463421, "learning_rate": 7.953333333333334e-07, "loss": 0.0, "num_tokens": 2242985.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 92.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 92.75, "completions/mean_terminated_length": 38.333335876464844, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 141.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.6382052898406982, "kl": 0.1818025279790163, "learning_rate": 7.95e-07, "loss": 0.3945, "num_tokens": 2243572.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 7616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 141.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.09395188838243484, "kl": 0.0018283475656062365, "learning_rate": 7.946666666666667e-07, "loss": 0.0001, "num_tokens": 2243792.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 141.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.01620199717581272, "kl": 0.0007060617208480835, "learning_rate": 7.943333333333334e-07, "loss": 0.0, "num_tokens": 2244052.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 141.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03634077310562134, "kl": 0.0008541723364032805, "learning_rate": 7.94e-07, "loss": 0.0, "num_tokens": 2244293.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 141.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02382316254079342, "kl": 0.011331932619214058, "learning_rate": 7.936666666666666e-07, "loss": 0.0006, "num_tokens": 2244553.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 141.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02722771093249321, "kl": 0.00026547908782958984, "learning_rate": 7.933333333333333e-07, "loss": 0.0, "num_tokens": 2244809.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 141.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008668236434459686, "kl": 0.0005737521569244564, "learning_rate": 7.93e-07, "loss": 0.0, "num_tokens": 2245044.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 141.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.023655198514461517, "kl": 0.0043696698267012835, "learning_rate": 7.926666666666668e-07, "loss": 0.0002, "num_tokens": 2245363.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 141.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04898698255419731, "kl": 0.007389507722109556, "learning_rate": 7.923333333333334e-07, "loss": 0.0004, "num_tokens": 2245647.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 141.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.18285970389842987, "kl": 0.1745816171169281, "learning_rate": 7.920000000000001e-07, "loss": 0.0088, "num_tokens": 2245962.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 141.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.1095123142004013, "kl": 0.021343120373785496, "learning_rate": 7.916666666666667e-07, "loss": 0.0012, "num_tokens": 2246246.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.7966949939727783, "kl": 0.03962932527065277, "learning_rate": 7.913333333333334e-07, "loss": 0.0948, "num_tokens": 2246523.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 141.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.36198094487190247, "kl": 0.06072469800710678, "learning_rate": 7.91e-07, "loss": 0.0028, "num_tokens": 2246822.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.3703181743621826, "kl": 0.210733522195369, "learning_rate": 7.906666666666666e-07, "loss": -0.0461, "num_tokens": 2247107.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 141.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.765426158905029, "kl": 0.03463748004287481, "learning_rate": 7.903333333333333e-07, "loss": 0.0142, "num_tokens": 2247412.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 141.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004028107039630413, "kl": 0.00026410221471451223, "learning_rate": 7.9e-07, "loss": 0.0, "num_tokens": 2247632.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 141.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0638107880949974, "kl": 0.008552071638405323, "learning_rate": 7.896666666666667e-07, "loss": 0.0004, "num_tokens": 2247944.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 141.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06308624148368835, "kl": 0.004701980855315924, "learning_rate": 7.893333333333334e-07, "loss": 0.0002, "num_tokens": 2248198.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 141.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03033612109720707, "kl": 0.0047260463470593095, "learning_rate": 7.890000000000001e-07, "loss": 0.0002, "num_tokens": 2248527.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02245710790157318, "kl": 0.002423997357254848, "learning_rate": 7.886666666666667e-07, "loss": 0.0001, "num_tokens": 2248797.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 141.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03912690281867981, "kl": 0.0017813832382671535, "learning_rate": 7.883333333333333e-07, "loss": 0.0001, "num_tokens": 2249067.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 141.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.026895679533481598, "kl": 0.0020042358519276604, "learning_rate": 7.88e-07, "loss": 0.0001, "num_tokens": 2249357.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 141.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.07445089519023895, "kl": 0.011246025562286377, "learning_rate": 7.876666666666667e-07, "loss": 0.0006, "num_tokens": 2249685.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 141.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.17501963675022125, "kl": 0.025519472546875477, "learning_rate": 7.873333333333333e-07, "loss": 0.0014, "num_tokens": 2250027.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 141.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008800389245152473, "kl": 0.09812876582145691, "learning_rate": 7.869999999999999e-07, "loss": 0.0049, "num_tokens": 2250399.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 141.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.09145799279212952, "kl": 0.0044666125904768705, "learning_rate": 7.866666666666667e-07, "loss": 0.0002, "num_tokens": 2250653.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.026417864486575127, "kl": 0.0023253896506503224, "learning_rate": 7.863333333333334e-07, "loss": 0.0001, "num_tokens": 2250930.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 141.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.20951484143733978, "kl": 0.08382711559534073, "learning_rate": 7.860000000000001e-07, "loss": 0.0042, "num_tokens": 2251333.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.009331641718745232, "kl": 0.00131767155835405, "learning_rate": 7.856666666666666e-07, "loss": 0.0001, "num_tokens": 2251615.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 141.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11620497703552246, "kl": 0.029218826442956924, "learning_rate": 7.853333333333333e-07, "loss": 0.0015, "num_tokens": 2251926.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 141.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.1253441423177719, "kl": 0.013156628585420549, "learning_rate": 7.85e-07, "loss": 0.0007, "num_tokens": 2252256.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 141.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.1754655838012695, "kl": 0.05695727467536926, "learning_rate": 7.846666666666667e-07, "loss": 0.0295, "num_tokens": 2252554.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 141.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.01952867954969406, "kl": 0.0005751699209213257, "learning_rate": 7.843333333333332e-07, "loss": 0.0, "num_tokens": 2252760.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.051282916218042374, "kl": 0.004241932416334748, "learning_rate": 7.839999999999999e-07, "loss": 0.0002, "num_tokens": 2253033.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 141.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.00404238561168313, "kl": 0.00012672245065914467, "learning_rate": 7.836666666666667e-07, "loss": 0.0, "num_tokens": 2253293.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 141.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.683503150939941, "kl": 0.09627540200017393, "learning_rate": 7.833333333333334e-07, "loss": 0.1975, "num_tokens": 2253621.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 141.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10470376163721085, "kl": 0.017083127051591873, "learning_rate": 7.830000000000001e-07, "loss": 0.0009, "num_tokens": 2253901.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 141.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.12036968767642975, "kl": 0.003055006265640259, "learning_rate": 7.826666666666667e-07, "loss": 0.0001, "num_tokens": 2254114.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 141.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 7.06951379776001, "kl": 0.04319071210920811, "learning_rate": 7.823333333333333e-07, "loss": 0.0417, "num_tokens": 2254390.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 141.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.09180354326963425, "kl": 0.016905405558645725, "learning_rate": 7.82e-07, "loss": 0.0009, "num_tokens": 2254719.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 141.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03564859554171562, "kl": 0.0022495443117804825, "learning_rate": 7.816666666666667e-07, "loss": 0.0001, "num_tokens": 2254991.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 141.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.12955109775066376, "kl": 0.04939829558134079, "learning_rate": 7.813333333333332e-07, "loss": 0.0025, "num_tokens": 2255330.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 141.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.3093033730983734, "kl": 0.026199544459814206, "learning_rate": 7.810000000000001e-07, "loss": 0.0016, "num_tokens": 2255599.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.05505216121673584, "kl": 0.004061909057782032, "learning_rate": 7.806666666666668e-07, "loss": 0.0002, "num_tokens": 2255897.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 141.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1282864362001419, "kl": 0.058737581595778465, "learning_rate": 7.803333333333334e-07, "loss": 0.003, "num_tokens": 2256270.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 141.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007894549635238945, "kl": 0.0037700235843658447, "learning_rate": 7.8e-07, "loss": 0.0002, "num_tokens": 2256506.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 141.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.7861123085021973, "kl": 0.2590033560991287, "learning_rate": 7.796666666666667e-07, "loss": 0.1184, "num_tokens": 2256867.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 141.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 4.3026018142700195, "kl": 0.20893632620573044, "learning_rate": 7.793333333333333e-07, "loss": 0.1137, "num_tokens": 2257244.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 7663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 141.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02406764030456543, "kl": 0.2635874003171921, "learning_rate": 7.79e-07, "loss": 0.0132, "num_tokens": 2257549.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 141.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.16898702085018158, "kl": 0.01364335953257978, "learning_rate": 7.786666666666667e-07, "loss": 0.0006, "num_tokens": 2257847.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 141.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.303480625152588, "kl": 0.1552797630429268, "learning_rate": 7.783333333333333e-07, "loss": 0.0088, "num_tokens": 2258101.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 7666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 141.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006510845851153135, "kl": 2.022087574005127e-05, "learning_rate": 7.780000000000001e-07, "loss": 0.0, "num_tokens": 2258321.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09836862981319427, "kl": 0.011081829317845404, "learning_rate": 7.776666666666668e-07, "loss": 0.0006, "num_tokens": 2258623.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 142.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.3293292224407196, "kl": 0.03504626452922821, "learning_rate": 7.773333333333334e-07, "loss": 0.0018, "num_tokens": 2258934.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 142.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009963270276784897, "kl": 0.26697002351284027, "learning_rate": 7.77e-07, "loss": 0.0133, "num_tokens": 2259238.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 142.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.10451026260852814, "kl": 0.024246810004115105, "learning_rate": 7.766666666666667e-07, "loss": 0.0012, "num_tokens": 2259532.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 142.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.19686958193778992, "kl": 0.015576696721836925, "learning_rate": 7.763333333333334e-07, "loss": 0.0007, "num_tokens": 2259813.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 142.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.05493713542819023, "kl": 0.002116822579409927, "learning_rate": 7.76e-07, "loss": 0.0001, "num_tokens": 2260077.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 142.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.19821488857269287, "kl": 0.04400704731233418, "learning_rate": 7.756666666666666e-07, "loss": 0.0014, "num_tokens": 2260431.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 142.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.013925421983003616, "kl": 0.00039866415318101645, "learning_rate": 7.753333333333333e-07, "loss": 0.0, "num_tokens": 2260747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 142.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.030978256836533546, "kl": 0.1637900248169899, "learning_rate": 7.750000000000001e-07, "loss": 0.0082, "num_tokens": 2261056.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 142.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.059655290096998215, "kl": 0.0076184822246432304, "learning_rate": 7.746666666666668e-07, "loss": 0.0004, "num_tokens": 2261368.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0249209962785244, "kl": 0.004268008982762694, "learning_rate": 7.743333333333333e-07, "loss": 0.0002, "num_tokens": 2261666.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 142.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.022615468129515648, "kl": 0.002672118949703872, "learning_rate": 7.74e-07, "loss": 0.0001, "num_tokens": 2261992.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005763848894275725, "kl": 1.6361474990844727e-05, "learning_rate": 7.736666666666667e-07, "loss": 0.0, "num_tokens": 2262212.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 142.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.004928158596158028, "kl": 0.00047995930071920156, "learning_rate": 7.733333333333334e-07, "loss": 0.0, "num_tokens": 2262446.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 142.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.021443970501422882, "kl": 0.0014528706087730825, "learning_rate": 7.729999999999999e-07, "loss": 0.0001, "num_tokens": 2262734.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 142.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.003393648425117135, "kl": 0.00034332985524088144, "learning_rate": 7.726666666666666e-07, "loss": 0.0, "num_tokens": 2263051.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 11.89183235168457, "kl": 0.16192296892404556, "learning_rate": 7.723333333333333e-07, "loss": 0.1482, "num_tokens": 2263270.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 142.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.027072906494140625, "kl": 0.00036369860026752576, "learning_rate": 7.720000000000001e-07, "loss": 0.0, "num_tokens": 2263526.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03711475804448128, "kl": 0.000505167234223336, "learning_rate": 7.716666666666668e-07, "loss": 0.0, "num_tokens": 2263739.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 142.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.27470842003822327, "kl": 0.08565065264701843, "learning_rate": 7.713333333333334e-07, "loss": 0.0044, "num_tokens": 2264075.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 142.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03703993186354637, "kl": 0.03506651986390352, "learning_rate": 7.71e-07, "loss": 0.0018, "num_tokens": 2264444.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 142.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.019775282591581345, "kl": 0.0018073072278639302, "learning_rate": 7.706666666666667e-07, "loss": 0.0001, "num_tokens": 2264714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 142.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04220382869243622, "kl": 0.011459505651146173, "learning_rate": 7.703333333333334e-07, "loss": 0.0006, "num_tokens": 2265016.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 142.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.013074828311800957, "kl": 0.0005056522786617279, "learning_rate": 7.699999999999999e-07, "loss": 0.0, "num_tokens": 2265276.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 142.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.2924010455608368, "kl": 0.038424568716436625, "learning_rate": 7.696666666666666e-07, "loss": 0.0019, "num_tokens": 2265551.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.2520202696323395, "kl": 0.038098571822047234, "learning_rate": 7.693333333333333e-07, "loss": 0.0021, "num_tokens": 2265834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 142.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06367816776037216, "kl": 0.03997368738055229, "learning_rate": 7.690000000000001e-07, "loss": 0.002, "num_tokens": 2266171.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 142.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.9301414489746094, "kl": 0.08043321594595909, "learning_rate": 7.686666666666667e-07, "loss": 0.0809, "num_tokens": 2266477.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.000715200207196176, "kl": 0.0037872716784477234, "learning_rate": 7.683333333333334e-07, "loss": 0.0002, "num_tokens": 2266713.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 142.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.004985891282558441, "kl": 0.00019570887525333092, "learning_rate": 7.68e-07, "loss": 0.0, "num_tokens": 2266973.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 142.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.12844973802566528, "kl": 0.015208481345325708, "learning_rate": 7.676666666666667e-07, "loss": 0.0009, "num_tokens": 2267288.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 142.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07045193761587143, "kl": 0.0034580400679260492, "learning_rate": 7.673333333333333e-07, "loss": 0.0002, "num_tokens": 2267560.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 142.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.030599597841501236, "kl": 0.004920025123283267, "learning_rate": 7.67e-07, "loss": 0.0003, "num_tokens": 2267816.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 142.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07589931786060333, "kl": 0.017345746979117393, "learning_rate": 7.666666666666666e-07, "loss": 0.001, "num_tokens": 2268089.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 142.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02332138828933239, "kl": 0.011409349273890257, "learning_rate": 7.663333333333333e-07, "loss": 0.0006, "num_tokens": 2268349.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 142.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.4340505599975586, "kl": 0.17629918456077576, "learning_rate": 7.660000000000001e-07, "loss": 0.0885, "num_tokens": 2268744.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 142.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.009740120731294155, "kl": 0.0014854312175884843, "learning_rate": 7.656666666666667e-07, "loss": 0.0001, "num_tokens": 2269004.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 142.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0739278569817543, "kl": 0.01706059416756034, "learning_rate": 7.653333333333334e-07, "loss": 0.0009, "num_tokens": 2269328.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 142.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.000878873048350215, "kl": 0.0012810421758331358, "learning_rate": 7.65e-07, "loss": 0.0001, "num_tokens": 2269608.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 5.551203727722168, "kl": 0.026009714230895042, "learning_rate": 7.646666666666667e-07, "loss": 0.204, "num_tokens": 2269899.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 7707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 142.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04626341909170151, "kl": 0.011175478342920542, "learning_rate": 7.643333333333333e-07, "loss": 0.0006, "num_tokens": 2270223.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04338642209768295, "kl": 0.0029926609713584185, "learning_rate": 7.64e-07, "loss": 0.0001, "num_tokens": 2270502.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 142.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.014252185821533, "kl": 0.02385175507515669, "learning_rate": 7.636666666666666e-07, "loss": 0.1599, "num_tokens": 2270853.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 142.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02578860893845558, "kl": 0.0014058202505111694, "learning_rate": 7.633333333333333e-07, "loss": 0.0001, "num_tokens": 2271065.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.029581209644675255, "kl": 0.0018641411734279245, "learning_rate": 7.63e-07, "loss": 0.0001, "num_tokens": 2271284.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.12762634456157684, "kl": 0.021816055290400982, "learning_rate": 7.626666666666667e-07, "loss": 0.0011, "num_tokens": 2271570.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 142.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.193521738052368, "kl": 0.12909285724163055, "learning_rate": 7.623333333333334e-07, "loss": -0.1468, "num_tokens": 2271930.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 142.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 6.3475847244262695, "kl": 0.09858269989490509, "learning_rate": 7.620000000000001e-07, "loss": 0.0093, "num_tokens": 2272174.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 142.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0230876374989748, "kl": 0.0007108896970748901, "learning_rate": 7.616666666666666e-07, "loss": 0.0, "num_tokens": 2272382.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 142.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.029099872335791588, "kl": 0.006785084804505459, "learning_rate": 7.613333333333333e-07, "loss": 0.0003, "num_tokens": 2272654.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 142.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.027688313275575638, "kl": 0.006402334664016962, "learning_rate": 7.61e-07, "loss": 0.0003, "num_tokens": 2272944.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 142.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 3.777172565460205, "kl": 0.13309234008193016, "learning_rate": 7.606666666666666e-07, "loss": 0.0841, "num_tokens": 2273326.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 7719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 142.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.7277072668075562, "kl": 0.2800426259636879, "learning_rate": 7.603333333333335e-07, "loss": 0.0065, "num_tokens": 2273729.0, "reward": 1.75, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.4433757066726685, "step": 7720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 142.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.006836375687271357, "kl": 0.0016658224340062588, "learning_rate": 7.600000000000001e-07, "loss": 0.0001, "num_tokens": 2274025.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 143.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06354095786809921, "kl": 0.007469865377061069, "learning_rate": 7.596666666666667e-07, "loss": 0.0003, "num_tokens": 2274347.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7722 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01785714365541935, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 143.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.344735145568848, "kl": 0.9647806752473116, "learning_rate": 7.593333333333334e-07, "loss": 0.0463, "num_tokens": 2274619.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 143.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007394056883640587, "kl": 2.3886561393737793e-05, "learning_rate": 7.590000000000001e-07, "loss": 0.0, "num_tokens": 2274839.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 143.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03275536373257637, "kl": 0.005106767290271819, "learning_rate": 7.586666666666666e-07, "loss": 0.0003, "num_tokens": 2275164.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 143.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.015481347218155861, "kl": 0.0004124348779441789, "learning_rate": 7.583333333333333e-07, "loss": 0.0, "num_tokens": 2275407.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 143.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.014245212078094482, "kl": 0.0004931064031552523, "learning_rate": 7.58e-07, "loss": 0.0, "num_tokens": 2275641.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 143.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.011389863677322865, "kl": 0.0024091824889183044, "learning_rate": 7.576666666666667e-07, "loss": 0.0001, "num_tokens": 2275857.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7728 }, { "clip_ratio/high_max": 0.006493506487458944, "clip_ratio/high_mean": 0.006493506487458944, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006493506487458944, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 143.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.3838181495666504, "kl": 0.1302348356693983, "learning_rate": 7.573333333333334e-07, "loss": -0.024, "num_tokens": 2276231.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 143.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04056413844227791, "kl": 0.03344356641173363, "learning_rate": 7.570000000000001e-07, "loss": 0.0016, "num_tokens": 2276647.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 143.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.026046650484204292, "kl": 0.002310799201950431, "learning_rate": 7.566666666666667e-07, "loss": 0.0001, "num_tokens": 2276924.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 143.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04400316998362541, "kl": 0.0038310529198497534, "learning_rate": 7.563333333333334e-07, "loss": 0.0002, "num_tokens": 2277196.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 143.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03560112416744232, "kl": 0.007601238437928259, "learning_rate": 7.56e-07, "loss": 0.0004, "num_tokens": 2277489.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 143.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08714082092046738, "kl": 0.015921350568532944, "learning_rate": 7.556666666666667e-07, "loss": 0.0009, "num_tokens": 2277763.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 143.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.005249094683676958, "kl": 0.0011807740083895624, "learning_rate": 7.553333333333333e-07, "loss": 0.0001, "num_tokens": 2278023.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 143.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 2.6178362369537354, "kl": 0.20231213700026274, "learning_rate": 7.55e-07, "loss": 0.0118, "num_tokens": 2278321.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 143.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.005135569255799055, "kl": 0.0002174973487854004, "learning_rate": 7.546666666666666e-07, "loss": 0.0, "num_tokens": 2278581.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 143.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10283061861991882, "kl": 0.018414645222947, "learning_rate": 7.543333333333334e-07, "loss": 0.001, "num_tokens": 2278867.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 143.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0893920361995697, "kl": 0.04302060045301914, "learning_rate": 7.540000000000001e-07, "loss": 0.0023, "num_tokens": 2279261.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 143.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.023387538269162178, "kl": 0.0018110059027094394, "learning_rate": 7.536666666666667e-07, "loss": 0.0001, "num_tokens": 2279557.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 143.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02173694223165512, "kl": 0.0010395422723377123, "learning_rate": 7.533333333333334e-07, "loss": 0.0001, "num_tokens": 2279831.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7741 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 143.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.6484451293945312, "kl": 0.09152450412511826, "learning_rate": 7.53e-07, "loss": 0.0785, "num_tokens": 2280142.0, "reward": 7.375, "reward_std": 0.25, "rewards/reward_combined/mean": 7.375, "rewards/reward_combined/std": 0.25, "step": 7742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 143.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.9953925609588623, "kl": 0.006105820881202817, "learning_rate": 7.526666666666667e-07, "loss": 0.092, "num_tokens": 2280485.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 143.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.15873970091342926, "kl": 0.026338991709053516, "learning_rate": 7.523333333333333e-07, "loss": 0.0014, "num_tokens": 2280812.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 143.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.012160657905042171, "kl": 0.007804167224094272, "learning_rate": 7.52e-07, "loss": 0.0004, "num_tokens": 2281084.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 143.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.15064144134521484, "kl": 0.0025098994374275208, "learning_rate": 7.516666666666666e-07, "loss": 0.0002, "num_tokens": 2281294.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 143.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01748109981417656, "kl": 0.0004294489699532278, "learning_rate": 7.513333333333334e-07, "loss": 0.0, "num_tokens": 2281564.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 143.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.024861948564648628, "kl": 0.006381360813975334, "learning_rate": 7.510000000000001e-07, "loss": 0.0003, "num_tokens": 2281854.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 143.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02653603069484234, "kl": 0.002868482959456742, "learning_rate": 7.506666666666668e-07, "loss": 0.0001, "num_tokens": 2282125.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 143.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.026265760883688927, "kl": 0.001516878604888916, "learning_rate": 7.503333333333333e-07, "loss": 0.0001, "num_tokens": 2282337.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 143.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02322395145893097, "kl": 0.011362049262970686, "learning_rate": 7.5e-07, "loss": 0.0006, "num_tokens": 2282597.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 143.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02803313359618187, "kl": 0.0050208475440740585, "learning_rate": 7.496666666666667e-07, "loss": 0.0003, "num_tokens": 2282901.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 143.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.002462148666382, "kl": 0.042132167145609856, "learning_rate": 7.493333333333333e-07, "loss": 0.0558, "num_tokens": 2283184.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 143.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.28658008575439453, "kl": 0.08020685985684395, "learning_rate": 7.49e-07, "loss": 0.0039, "num_tokens": 2283529.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 143.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.8932595252990723, "kl": 0.023721948266029358, "learning_rate": 7.486666666666667e-07, "loss": 0.0008, "num_tokens": 2283855.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 143.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.014380471780896187, "kl": 0.15950369089841843, "learning_rate": 7.483333333333333e-07, "loss": 0.008, "num_tokens": 2284165.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 143.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.19520796835422516, "kl": 0.04966457188129425, "learning_rate": 7.48e-07, "loss": 0.0025, "num_tokens": 2284507.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 143.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037564807571470737, "kl": 0.0002024372515734285, "learning_rate": 7.476666666666668e-07, "loss": 0.0, "num_tokens": 2284819.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 143.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.17725761234760284, "kl": 0.0379670774564147, "learning_rate": 7.473333333333333e-07, "loss": 0.0017, "num_tokens": 2285164.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 143.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.057189084589481354, "kl": 0.009764651767909527, "learning_rate": 7.47e-07, "loss": 0.0005, "num_tokens": 2285476.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 143.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.5905125737190247, "kl": 0.04659403022378683, "learning_rate": 7.466666666666667e-07, "loss": 0.0023, "num_tokens": 2285808.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 143.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.032066769897937775, "kl": 0.0002530887722969055, "learning_rate": 7.463333333333334e-07, "loss": 0.0, "num_tokens": 2286021.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 143.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.012608855031430721, "kl": 0.0001500278667663224, "learning_rate": 7.46e-07, "loss": 0.0, "num_tokens": 2286277.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009803921915590763, "clip_ratio/low_min": 0.009803921915590763, "clip_ratio/region_mean": 0.009803921915590763, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 143.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 8.283246994018555, "kl": 0.03131023049354553, "learning_rate": 7.456666666666667e-07, "loss": 0.0411, "num_tokens": 2286602.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 7764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 143.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.028160201385617256, "kl": 0.0009720735251903534, "learning_rate": 7.453333333333334e-07, "loss": 0.0, "num_tokens": 2286862.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 143.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.048435479402542114, "kl": 0.040613481774926186, "learning_rate": 7.45e-07, "loss": 0.002, "num_tokens": 2287172.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 143.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.008940286934375763, "kl": 0.09814896434545517, "learning_rate": 7.446666666666667e-07, "loss": 0.0049, "num_tokens": 2287544.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 143.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.004764673765748739, "kl": 0.0009603133366908878, "learning_rate": 7.443333333333333e-07, "loss": 0.0, "num_tokens": 2287828.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 143.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.014307666569948196, "kl": 0.0007961370865814388, "learning_rate": 7.44e-07, "loss": 0.0, "num_tokens": 2288117.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 143.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.004489877261221409, "kl": 0.0002865791320800781, "learning_rate": 7.436666666666667e-07, "loss": 0.0, "num_tokens": 2288337.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 143.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 2.451237201690674, "kl": 0.778635174036026, "learning_rate": 7.433333333333333e-07, "loss": 0.0713, "num_tokens": 2288643.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 143.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007532176678068936, "kl": 0.003782317042350769, "learning_rate": 7.43e-07, "loss": 0.0002, "num_tokens": 2288879.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 143.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.05567490682005882, "kl": 0.0043501444888534024, "learning_rate": 7.426666666666667e-07, "loss": 0.0002, "num_tokens": 2289198.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 143.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.4712536334991455, "kl": 0.11329841800034046, "learning_rate": 7.423333333333334e-07, "loss": 0.0058, "num_tokens": 2289532.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 143.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224059522151947, "kl": 0.0027415878139436245, "learning_rate": 7.42e-07, "loss": 0.0001, "num_tokens": 2289797.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 144.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04243478178977966, "kl": 0.01574730360880494, "learning_rate": 7.416666666666667e-07, "loss": 0.0008, "num_tokens": 2290096.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 144.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.020603708922863007, "kl": 0.0033009694889187813, "learning_rate": 7.413333333333334e-07, "loss": 0.0002, "num_tokens": 2290387.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007747675408609211, "kl": 0.003777734935283661, "learning_rate": 7.41e-07, "loss": 0.0002, "num_tokens": 2290623.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 144.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.004194805398583412, "kl": 0.0015745406853966415, "learning_rate": 7.406666666666667e-07, "loss": 0.0001, "num_tokens": 2290919.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09947057068347931, "kl": 0.004641398787498474, "learning_rate": 7.403333333333334e-07, "loss": 0.0002, "num_tokens": 2291139.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 144.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.009100621566176414, "kl": 0.0005209706723690033, "learning_rate": 7.4e-07, "loss": 0.0, "num_tokens": 2291399.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 144.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07040928304195404, "kl": 0.011888418346643448, "learning_rate": 7.396666666666667e-07, "loss": 0.0006, "num_tokens": 2291713.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 144.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 1.9421210289001465, "kl": 0.24796737730503082, "learning_rate": 7.393333333333334e-07, "loss": 0.0007, "num_tokens": 2292084.0, "reward": 2.375, "reward_std": 2.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 2.25, "step": 7783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 144.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.027161186560988426, "kl": 0.0025201529497280717, "learning_rate": 7.389999999999999e-07, "loss": 0.0001, "num_tokens": 2292344.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 144.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 6.503671169281006, "kl": 0.6986650750041008, "learning_rate": 7.386666666666667e-07, "loss": 0.0457, "num_tokens": 2292658.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 144.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.025138195604085922, "kl": 0.0013024210929870605, "learning_rate": 7.383333333333334e-07, "loss": 0.0001, "num_tokens": 2292870.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 144.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03400333598256111, "kl": 0.009914604714140296, "learning_rate": 7.38e-07, "loss": 0.0005, "num_tokens": 2293156.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 144.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.15238557755947113, "kl": 0.025332734920084476, "learning_rate": 7.376666666666666e-07, "loss": 0.0013, "num_tokens": 2293456.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 144.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.018772460520267487, "kl": 0.0027480390563141555, "learning_rate": 7.373333333333334e-07, "loss": 0.0001, "num_tokens": 2293722.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 144.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04934407398104668, "kl": 0.042495377361774445, "learning_rate": 7.37e-07, "loss": 0.0021, "num_tokens": 2294024.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.011766253970563412, "kl": 0.002495020627975464, "learning_rate": 7.366666666666667e-07, "loss": 0.0001, "num_tokens": 2294240.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 144.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.018804864957928658, "kl": 0.003282279008999467, "learning_rate": 7.363333333333333e-07, "loss": 0.0002, "num_tokens": 2294530.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 144.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09307824075222015, "kl": 0.01906169019639492, "learning_rate": 7.36e-07, "loss": 0.001, "num_tokens": 2294802.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 144.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01951969601213932, "kl": 0.0034652543254196644, "learning_rate": 7.356666666666667e-07, "loss": 0.0002, "num_tokens": 2295123.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 144.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07716208696365356, "kl": 0.047879163175821304, "learning_rate": 7.353333333333334e-07, "loss": 0.0024, "num_tokens": 2295497.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 144.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.1274804174900055, "kl": 0.013172006234526634, "learning_rate": 7.350000000000001e-07, "loss": 0.001, "num_tokens": 2295855.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 144.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05018867924809456, "kl": 0.059575026854872704, "learning_rate": 7.346666666666666e-07, "loss": 0.003, "num_tokens": 2296230.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 144.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.05815522000193596, "kl": 0.01121479517314583, "learning_rate": 7.343333333333334e-07, "loss": 0.0005, "num_tokens": 2296532.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 144.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.3417277336120605, "kl": 0.14042966067790985, "learning_rate": 7.34e-07, "loss": 0.0087, "num_tokens": 2296879.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 7799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 144.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02039925754070282, "kl": 0.0007336969720199704, "learning_rate": 7.336666666666667e-07, "loss": 0.0, "num_tokens": 2297115.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 144.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.040640510618686676, "kl": 0.0008734017610549927, "learning_rate": 7.333333333333333e-07, "loss": 0.0, "num_tokens": 2297325.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 144.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04618072509765625, "kl": 0.012975095305591822, "learning_rate": 7.33e-07, "loss": 0.0006, "num_tokens": 2297660.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 144.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02459227293729782, "kl": 0.001953233266249299, "learning_rate": 7.326666666666667e-07, "loss": 0.0001, "num_tokens": 2297931.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 144.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069475783966481686, "kl": 0.1613440364599228, "learning_rate": 7.323333333333334e-07, "loss": 0.0081, "num_tokens": 2298240.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 144.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00587360979989171, "kl": 0.0003445446491241455, "learning_rate": 7.32e-07, "loss": 0.0, "num_tokens": 2298500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 144.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 6.115880489349365, "kl": 0.011238036211580038, "learning_rate": 7.316666666666666e-07, "loss": 0.0466, "num_tokens": 2298835.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 144.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04636514186859131, "kl": 0.013248456176370382, "learning_rate": 7.313333333333334e-07, "loss": 0.0007, "num_tokens": 2299109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 144.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.1576414853334427, "kl": 0.0263519324362278, "learning_rate": 7.310000000000001e-07, "loss": 0.0014, "num_tokens": 2299379.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 144.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004367793910205364, "kl": 0.0008925153524614871, "learning_rate": 7.306666666666666e-07, "loss": 0.0, "num_tokens": 2299663.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 144.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04627133160829544, "kl": 0.05415169894695282, "learning_rate": 7.303333333333333e-07, "loss": 0.0027, "num_tokens": 2299999.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 144.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.28215914964675903, "kl": 0.01585837733000517, "learning_rate": 7.300000000000001e-07, "loss": 0.0008, "num_tokens": 2300334.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 144.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.035695239901542664, "kl": 0.04531832970678806, "learning_rate": 7.296666666666667e-07, "loss": 0.0023, "num_tokens": 2300738.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 144.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.765526294708252, "kl": 0.06695375754497945, "learning_rate": 7.293333333333334e-07, "loss": 0.1279, "num_tokens": 2301075.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 144.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07994627207517624, "kl": 0.03670547902584076, "learning_rate": 7.29e-07, "loss": 0.0017, "num_tokens": 2301425.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 144.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.09260185062885284, "kl": 0.016643074341118336, "learning_rate": 7.286666666666666e-07, "loss": 0.0008, "num_tokens": 2301724.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 144.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.13086636364459991, "kl": 0.018740601604804397, "learning_rate": 7.283333333333334e-07, "loss": 0.001, "num_tokens": 2301998.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 144.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.004347720183432102, "kl": 0.00014990071940701455, "learning_rate": 7.280000000000001e-07, "loss": 0.0, "num_tokens": 2302270.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 144.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.20595932006835938, "kl": 0.031236987560987473, "learning_rate": 7.276666666666666e-07, "loss": 0.0017, "num_tokens": 2302601.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02995908074080944, "kl": 0.0037928506499156356, "learning_rate": 7.273333333333333e-07, "loss": 0.0002, "num_tokens": 2302818.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 144.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007915023015812039, "kl": 0.0012798905372619629, "learning_rate": 7.270000000000001e-07, "loss": 0.0001, "num_tokens": 2303098.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 144.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.010825767181813717, "kl": 0.0004008780015283264, "learning_rate": 7.266666666666667e-07, "loss": 0.0, "num_tokens": 2303412.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 144.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.8050737380981445, "kl": 0.9044334143400192, "learning_rate": 7.263333333333333e-07, "loss": 0.064, "num_tokens": 2303685.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 144.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03513854742050171, "kl": 0.0018830194603651762, "learning_rate": 7.26e-07, "loss": 0.0001, "num_tokens": 2303957.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 144.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.059505682438611984, "kl": 0.015135690569877625, "learning_rate": 7.256666666666667e-07, "loss": 0.0008, "num_tokens": 2304280.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 144.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.05364292487502098, "kl": 0.0028927183302585036, "learning_rate": 7.253333333333334e-07, "loss": 0.0002, "num_tokens": 2304556.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 144.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.003947919700294733, "kl": 4.607439041137695e-05, "learning_rate": 7.25e-07, "loss": 0.0, "num_tokens": 2304812.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 144.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.014857267960906029, "kl": 0.0005029442836530507, "learning_rate": 7.246666666666667e-07, "loss": 0.0, "num_tokens": 2305061.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07076448947191238, "kl": 0.0010878369212150574, "learning_rate": 7.243333333333333e-07, "loss": 0.0001, "num_tokens": 2305273.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 144.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0993768721818924, "kl": 0.022278862074017525, "learning_rate": 7.240000000000001e-07, "loss": 0.0012, "num_tokens": 2305559.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 145.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01657954230904579, "kl": 0.2657916694879532, "learning_rate": 7.236666666666666e-07, "loss": 0.0133, "num_tokens": 2305863.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 145.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.016393275931477547, "kl": 0.001748779322952032, "learning_rate": 7.233333333333333e-07, "loss": 0.0001, "num_tokens": 2306140.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 145.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.016857551410794258, "kl": 0.2657901346683502, "learning_rate": 7.23e-07, "loss": 0.0133, "num_tokens": 2306444.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 145.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.028873370960354805, "kl": 0.012613944243639708, "learning_rate": 7.226666666666667e-07, "loss": 0.0007, "num_tokens": 2306718.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 145.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.027607008814811707, "kl": 0.0006582169444300234, "learning_rate": 7.223333333333334e-07, "loss": 0.0, "num_tokens": 2306951.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 145.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 4.604893684387207, "kl": 0.028711873339489102, "learning_rate": 7.22e-07, "loss": 0.0751, "num_tokens": 2307298.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 145.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08080136775970459, "kl": 0.019822733476758003, "learning_rate": 7.216666666666667e-07, "loss": 0.001, "num_tokens": 2307623.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 145.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.13860845565795898, "kl": 0.0819106437265873, "learning_rate": 7.213333333333333e-07, "loss": 0.0042, "num_tokens": 2308020.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 145.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.024136235937476158, "kl": 0.001156538724899292, "learning_rate": 7.210000000000001e-07, "loss": 0.0001, "num_tokens": 2308232.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 145.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03194114938378334, "kl": 0.007562480866909027, "learning_rate": 7.206666666666667e-07, "loss": 0.0004, "num_tokens": 2308521.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 145.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01061325054615736, "kl": 0.0002386033520451747, "learning_rate": 7.203333333333333e-07, "loss": 0.0, "num_tokens": 2308791.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 145.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.523732662200928, "kl": 0.09044819604605436, "learning_rate": 7.2e-07, "loss": 0.0406, "num_tokens": 2309089.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 145.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.003621538169682026, "kl": 0.0014243125915527344, "learning_rate": 7.196666666666668e-07, "loss": 0.0001, "num_tokens": 2309349.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 145.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03520356863737106, "kl": 0.04522598721086979, "learning_rate": 7.193333333333333e-07, "loss": 0.0023, "num_tokens": 2309753.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 145.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.020660918205976486, "kl": 0.011906451545655727, "learning_rate": 7.19e-07, "loss": 0.0006, "num_tokens": 2310013.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.012060144916176796, "kl": 0.0073663960210978985, "learning_rate": 7.186666666666667e-07, "loss": 0.0004, "num_tokens": 2310285.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 145.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.119589805603027, "kl": 0.026283381041139364, "learning_rate": 7.183333333333333e-07, "loss": 0.0258, "num_tokens": 2310614.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 145.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005029130261391401, "kl": 1.4007091522216797e-05, "learning_rate": 7.18e-07, "loss": 0.0, "num_tokens": 2310834.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 145.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.029953503981232643, "kl": 0.0006314888596534729, "learning_rate": 7.176666666666667e-07, "loss": 0.0, "num_tokens": 2311042.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 145.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.011209409683942795, "kl": 0.0023278817534446716, "learning_rate": 7.173333333333333e-07, "loss": 0.0001, "num_tokens": 2311258.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.05701678991317749, "kl": 0.00676327757537365, "learning_rate": 7.17e-07, "loss": 0.0003, "num_tokens": 2311562.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 145.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006832170765846968, "kl": 0.0004152357578277588, "learning_rate": 7.166666666666668e-07, "loss": 0.0, "num_tokens": 2311822.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 145.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0734308585524559, "kl": 0.02048661932349205, "learning_rate": 7.163333333333333e-07, "loss": 0.001, "num_tokens": 2312116.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 145.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06470665335655212, "kl": 0.018842053599655628, "learning_rate": 7.16e-07, "loss": 0.0009, "num_tokens": 2312390.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 145.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.06588394194841385, "kl": 0.007601095596328378, "learning_rate": 7.156666666666667e-07, "loss": 0.0004, "num_tokens": 2312704.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 145.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.023730188608169556, "kl": 0.003889709711074829, "learning_rate": 7.153333333333334e-07, "loss": 0.0002, "num_tokens": 2312977.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 145.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.017198164016008377, "kl": 0.0042624999769032, "learning_rate": 7.15e-07, "loss": 0.0002, "num_tokens": 2313315.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 145.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008212963584810495, "kl": 0.0037628933787345886, "learning_rate": 7.146666666666667e-07, "loss": 0.0002, "num_tokens": 2313551.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 145.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.019984684884548187, "kl": 0.0012029930367134511, "learning_rate": 7.143333333333334e-07, "loss": 0.0001, "num_tokens": 2313874.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 145.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.683157920837402, "kl": 0.17300502955913544, "learning_rate": 7.14e-07, "loss": 0.1559, "num_tokens": 2314197.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 3.800112247467041, "kl": 0.006968340370804071, "learning_rate": 7.136666666666667e-07, "loss": 0.085, "num_tokens": 2314470.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 145.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04471684247255325, "kl": 0.04221216402947903, "learning_rate": 7.133333333333333e-07, "loss": 0.0021, "num_tokens": 2314770.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 145.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03296776860952377, "kl": 0.003319399431347847, "learning_rate": 7.13e-07, "loss": 0.0002, "num_tokens": 2315071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 145.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05441904440522194, "kl": 0.012024045921862125, "learning_rate": 7.126666666666667e-07, "loss": 0.0006, "num_tokens": 2315379.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 145.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.08604150265455246, "kl": 0.0030240335618145764, "learning_rate": 7.123333333333333e-07, "loss": 0.0001, "num_tokens": 2315651.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 145.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.4204092025756836, "kl": 0.11133990064263344, "learning_rate": 7.12e-07, "loss": 0.0252, "num_tokens": 2316016.0, "reward": 4.625, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 4.308422088623047, "step": 7865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 145.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.4855213165283203, "kl": 0.07751660235226154, "learning_rate": 7.116666666666667e-07, "loss": 0.0039, "num_tokens": 2316388.0, "reward": 3.375, "reward_std": 0.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 0.25, "step": 7866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 145.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.008625567890703678, "kl": 0.00034084319486282766, "learning_rate": 7.113333333333334e-07, "loss": 0.0, "num_tokens": 2316699.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 145.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04620688781142235, "kl": 0.057636771351099014, "learning_rate": 7.11e-07, "loss": 0.0029, "num_tokens": 2317032.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 145.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.006471026688814163, "kl": 0.0004990100860595703, "learning_rate": 7.106666666666667e-07, "loss": 0.0, "num_tokens": 2317292.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 145.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.019478855654597282, "kl": 0.002590879797935486, "learning_rate": 7.103333333333334e-07, "loss": 0.0001, "num_tokens": 2317580.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 145.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.012735706754028797, "kl": 0.0022063918877393007, "learning_rate": 7.1e-07, "loss": 0.0001, "num_tokens": 2317876.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 145.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.16797488927841187, "kl": 0.0031422898173332214, "learning_rate": 7.096666666666667e-07, "loss": 0.0002, "num_tokens": 2318088.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 145.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.033851008862257004, "kl": 0.0063656826969236135, "learning_rate": 7.093333333333334e-07, "loss": 0.0003, "num_tokens": 2318421.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 145.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0365244597196579, "kl": 0.043304454535245895, "learning_rate": 7.09e-07, "loss": 0.0023, "num_tokens": 2318795.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.12554596364498138, "kl": 0.007517669582739472, "learning_rate": 7.086666666666667e-07, "loss": 0.0004, "num_tokens": 2319081.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 145.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0053313616663217545, "kl": 0.0004590122262015939, "learning_rate": 7.083333333333334e-07, "loss": 0.0, "num_tokens": 2319330.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 145.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.09370862692594528, "kl": 0.00645895441994071, "learning_rate": 7.079999999999999e-07, "loss": 0.0003, "num_tokens": 2319592.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 145.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.037812262773513794, "kl": 0.00982398958876729, "learning_rate": 7.076666666666667e-07, "loss": 0.0006, "num_tokens": 2319918.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 145.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.06566604226827621, "kl": 0.001881853153463453, "learning_rate": 7.073333333333334e-07, "loss": 0.0001, "num_tokens": 2320140.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 145.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05149158462882042, "kl": 0.0015288891954696737, "learning_rate": 7.07e-07, "loss": 0.0001, "num_tokens": 2320397.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.09078837186098099, "kl": 0.010929904878139496, "learning_rate": 7.066666666666666e-07, "loss": 0.0005, "num_tokens": 2320668.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 145.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.645307540893555, "kl": 0.04936479404568672, "learning_rate": 7.063333333333334e-07, "loss": 0.2113, "num_tokens": 2321016.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.12648501992225647, "kl": 0.03250580746680498, "learning_rate": 7.06e-07, "loss": 0.0017, "num_tokens": 2321305.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 146.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.029397668316960335, "kl": 0.004287875635782257, "learning_rate": 7.056666666666667e-07, "loss": 0.0002, "num_tokens": 2321573.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.014732036739587784, "kl": 0.0015842552529647946, "learning_rate": 7.053333333333333e-07, "loss": 0.0001, "num_tokens": 2321847.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 146.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09335663169622421, "kl": 0.005483564687892795, "learning_rate": 7.05e-07, "loss": 0.0003, "num_tokens": 2322107.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 146.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.5405759811401367, "kl": 0.047873176634311676, "learning_rate": 7.046666666666667e-07, "loss": 0.0831, "num_tokens": 2322442.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 146.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.6981112957000732, "kl": 0.006339870858937502, "learning_rate": 7.043333333333334e-07, "loss": 0.2224, "num_tokens": 2322805.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 146.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.036600496619939804, "kl": 0.0019297749386169016, "learning_rate": 7.040000000000001e-07, "loss": 0.0001, "num_tokens": 2323073.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 146.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.11252889037132263, "kl": 0.02112907450646162, "learning_rate": 7.036666666666666e-07, "loss": 0.001, "num_tokens": 2323378.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 146.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02057185396552086, "kl": 0.011972520500421524, "learning_rate": 7.033333333333334e-07, "loss": 0.0006, "num_tokens": 2323638.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 146.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01874908246099949, "kl": 0.004027571063488722, "learning_rate": 7.03e-07, "loss": 0.0002, "num_tokens": 2323896.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 146.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007877022144384682, "kl": 0.0037712156772613525, "learning_rate": 7.026666666666667e-07, "loss": 0.0002, "num_tokens": 2324132.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 146.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09150916337966919, "kl": 0.03694234415888786, "learning_rate": 7.023333333333333e-07, "loss": 0.0018, "num_tokens": 2324448.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 146.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.1805944442749023, "kl": 0.14544594287872314, "learning_rate": 7.02e-07, "loss": 0.0437, "num_tokens": 2324822.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 146.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03395191207528114, "kl": 0.1617203652858734, "learning_rate": 7.016666666666667e-07, "loss": 0.0081, "num_tokens": 2325132.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 146.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005269645480439067, "kl": 2.358853816986084e-05, "learning_rate": 7.013333333333334e-07, "loss": 0.0, "num_tokens": 2325344.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 146.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04336988925933838, "kl": 0.006349961506202817, "learning_rate": 7.01e-07, "loss": 0.0003, "num_tokens": 2325637.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 146.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09106840938329697, "kl": 0.006232064217329025, "learning_rate": 7.006666666666666e-07, "loss": 0.0003, "num_tokens": 2325852.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 146.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.028742168098688126, "kl": 0.00420410861261189, "learning_rate": 7.003333333333334e-07, "loss": 0.0002, "num_tokens": 2326196.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 146.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.8615872859954834, "kl": 0.01856350596062839, "learning_rate": 7.000000000000001e-07, "loss": 0.0004, "num_tokens": 2326525.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 7901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 146.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03740246221423149, "kl": 0.0005555689131142572, "learning_rate": 6.996666666666666e-07, "loss": 0.0, "num_tokens": 2326781.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 146.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02081618830561638, "kl": 0.047073764726519585, "learning_rate": 6.993333333333333e-07, "loss": 0.0024, "num_tokens": 2327185.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 146.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.065969228744507, "kl": 0.07833204790949821, "learning_rate": 6.990000000000001e-07, "loss": -0.0042, "num_tokens": 2327561.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.022117936983704567, "kl": 0.001973973121494055, "learning_rate": 6.986666666666667e-07, "loss": 0.0001, "num_tokens": 2327857.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 146.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03592696040868759, "kl": 0.003980218549259007, "learning_rate": 6.983333333333334e-07, "loss": 0.0002, "num_tokens": 2328147.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 146.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.016566133126616478, "kl": 0.26588982343673706, "learning_rate": 6.98e-07, "loss": 0.0133, "num_tokens": 2328451.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 146.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.009487488307058811, "kl": 0.00019461263946141116, "learning_rate": 6.976666666666666e-07, "loss": 0.0, "num_tokens": 2328721.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 146.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.005486609879881144, "kl": 0.0003608107508625835, "learning_rate": 6.973333333333334e-07, "loss": 0.0, "num_tokens": 2328941.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 146.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.005849889479577541, "kl": 0.001218301069457084, "learning_rate": 6.970000000000001e-07, "loss": 0.0001, "num_tokens": 2329201.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.010366600006818771, "kl": 0.0013481086352840066, "learning_rate": 6.966666666666666e-07, "loss": 0.0001, "num_tokens": 2329471.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 146.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0389113612473011, "kl": 0.023123985156416893, "learning_rate": 6.963333333333333e-07, "loss": 0.0012, "num_tokens": 2329745.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 146.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05154503136873245, "kl": 0.04030958376824856, "learning_rate": 6.960000000000001e-07, "loss": 0.002, "num_tokens": 2330056.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 146.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.01767592318356037, "kl": 0.0008933462668210268, "learning_rate": 6.956666666666667e-07, "loss": 0.0, "num_tokens": 2330332.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 146.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.036674413830041885, "kl": 0.07475689984858036, "learning_rate": 6.953333333333333e-07, "loss": 0.0038, "num_tokens": 2330702.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 146.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.008082011714577675, "kl": 0.0004647746682167053, "learning_rate": 6.95e-07, "loss": 0.0, "num_tokens": 2330962.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 146.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02330547571182251, "kl": 0.0010652343335095793, "learning_rate": 6.946666666666667e-07, "loss": 0.0, "num_tokens": 2331178.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 146.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.09168670326471329, "kl": 0.019862588495016098, "learning_rate": 6.943333333333334e-07, "loss": 0.001, "num_tokens": 2331475.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.033646319061517715, "kl": 0.0031009033555164933, "learning_rate": 6.94e-07, "loss": 0.0002, "num_tokens": 2331749.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 146.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.009794164448976517, "kl": 0.00040640901715960354, "learning_rate": 6.936666666666667e-07, "loss": 0.0, "num_tokens": 2332067.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.022331416606903076, "kl": 0.006107160821557045, "learning_rate": 6.933333333333333e-07, "loss": 0.0003, "num_tokens": 2332335.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 146.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07560805231332779, "kl": 0.017447875812649727, "learning_rate": 6.930000000000001e-07, "loss": 0.001, "num_tokens": 2332617.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 146.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.026410184800624847, "kl": 0.007153031183406711, "learning_rate": 6.926666666666666e-07, "loss": 0.0003, "num_tokens": 2332909.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 146.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.007500403095036745, "kl": 0.0003406302275834605, "learning_rate": 6.923333333333333e-07, "loss": 0.0, "num_tokens": 2333158.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 146.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.053788814693689346, "kl": 0.04855903051793575, "learning_rate": 6.92e-07, "loss": 0.0024, "num_tokens": 2333495.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 146.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0677071139216423, "kl": 0.016340465284883976, "learning_rate": 6.916666666666667e-07, "loss": 0.0008, "num_tokens": 2333819.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 146.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.01156686432659626, "kl": 0.0026587173342704773, "learning_rate": 6.913333333333334e-07, "loss": 0.0001, "num_tokens": 2334035.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 146.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.2613486349582672, "kl": 0.0367953865788877, "learning_rate": 6.91e-07, "loss": 0.0018, "num_tokens": 2334339.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 146.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03902868181467056, "kl": 0.009209196548908949, "learning_rate": 6.906666666666667e-07, "loss": 0.0005, "num_tokens": 2334670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 146.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05344216525554657, "kl": 0.011436731845606118, "learning_rate": 6.903333333333333e-07, "loss": 0.0006, "num_tokens": 2334957.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 146.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.050553612411022186, "kl": 0.008439704310148954, "learning_rate": 6.900000000000001e-07, "loss": 0.0004, "num_tokens": 2335259.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 146.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030578509904444218, "kl": 0.0001486109395045787, "learning_rate": 6.896666666666667e-07, "loss": 0.0, "num_tokens": 2335571.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 146.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 1.7461144924163818, "kl": 0.11521635204553604, "learning_rate": 6.893333333333333e-07, "loss": 0.0369, "num_tokens": 2335955.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 146.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01660105399787426, "kl": 0.0004939181380905211, "learning_rate": 6.89e-07, "loss": 0.0, "num_tokens": 2336188.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 146.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.00032070622546598315, "kl": 8.128583431243896e-06, "learning_rate": 6.886666666666668e-07, "loss": 0.0, "num_tokens": 2336408.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 146.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.055148590356111526, "kl": 0.008990629576146603, "learning_rate": 6.883333333333333e-07, "loss": 0.0004, "num_tokens": 2336720.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 146.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.12067971378564835, "kl": 0.033998752012848854, "learning_rate": 6.88e-07, "loss": 0.0018, "num_tokens": 2337008.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.2857086658477783, "kl": 0.02930644527077675, "learning_rate": 6.876666666666667e-07, "loss": 0.0015, "num_tokens": 2337290.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 147.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03171192482113838, "kl": 0.0007163698101066984, "learning_rate": 6.873333333333333e-07, "loss": 0.0, "num_tokens": 2337547.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.10451661795377731, "kl": 0.024647328886203468, "learning_rate": 6.87e-07, "loss": 0.0012, "num_tokens": 2337837.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 147.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.17091840505599976, "kl": 0.008172134403139353, "learning_rate": 6.866666666666667e-07, "loss": 0.0006, "num_tokens": 2338064.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 147.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.3531803786754608, "kl": 0.01787441223859787, "learning_rate": 6.863333333333333e-07, "loss": 0.0012, "num_tokens": 2338318.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 147.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.030056845396757126, "kl": 0.006302210036665201, "learning_rate": 6.86e-07, "loss": 0.0003, "num_tokens": 2338586.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 147.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00673312321305275, "kl": 0.163905531167984, "learning_rate": 6.856666666666668e-07, "loss": 0.0082, "num_tokens": 2338894.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 147.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.22499795258045197, "kl": 0.01883025059942156, "learning_rate": 6.853333333333333e-07, "loss": 0.001, "num_tokens": 2339192.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 147.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1145038977265358, "kl": 0.022760297171771526, "learning_rate": 6.85e-07, "loss": 0.0011, "num_tokens": 2339492.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 147.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.6732354164123535, "kl": 0.6013218630105257, "learning_rate": 6.846666666666667e-07, "loss": 0.0609, "num_tokens": 2339831.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 147.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.6160852909088135, "kl": 0.12218927592039108, "learning_rate": 6.843333333333334e-07, "loss": -0.0094, "num_tokens": 2340195.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 147.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00030729122227057815, "kl": 6.951391696929932e-06, "learning_rate": 6.84e-07, "loss": 0.0, "num_tokens": 2340415.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 147.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.13470609486103058, "kl": 0.013342326506972313, "learning_rate": 6.836666666666667e-07, "loss": 0.0008, "num_tokens": 2340675.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 147.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.028726885095238686, "kl": 0.001210954214911908, "learning_rate": 6.833333333333333e-07, "loss": 0.0001, "num_tokens": 2340945.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 147.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.028139915317296982, "kl": 0.00025150924921035767, "learning_rate": 6.83e-07, "loss": 0.0, "num_tokens": 2341157.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06457463651895523, "kl": 0.006834480445832014, "learning_rate": 6.826666666666667e-07, "loss": 0.0003, "num_tokens": 2341455.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 147.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10724636167287827, "kl": 0.015549950301647186, "learning_rate": 6.823333333333333e-07, "loss": 0.0008, "num_tokens": 2341731.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 147.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.009234708733856678, "kl": 0.0004986152052879333, "learning_rate": 6.82e-07, "loss": 0.0, "num_tokens": 2341991.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 147.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.05288499593734741, "kl": 0.011790297867264599, "learning_rate": 6.816666666666667e-07, "loss": 0.0006, "num_tokens": 2342310.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 147.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005728833493776619, "kl": 0.0012573727872222662, "learning_rate": 6.813333333333333e-07, "loss": 0.0001, "num_tokens": 2342590.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 147.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007495522731915116, "kl": 0.00378631055355072, "learning_rate": 6.81e-07, "loss": 0.0002, "num_tokens": 2342826.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 147.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02170245349407196, "kl": 0.0016181372193386778, "learning_rate": 6.806666666666667e-07, "loss": 0.0001, "num_tokens": 2343149.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 147.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.252466082572937, "kl": 0.02959541231393814, "learning_rate": 6.803333333333334e-07, "loss": 0.0023, "num_tokens": 2343412.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 147.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.027295619249343872, "kl": 0.000490233302116394, "learning_rate": 6.8e-07, "loss": 0.0, "num_tokens": 2343622.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.11848331242799759, "kl": 0.014552897773683071, "learning_rate": 6.796666666666667e-07, "loss": 0.0008, "num_tokens": 2343898.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 147.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.029056072235107422, "kl": 0.003153599624056369, "learning_rate": 6.793333333333334e-07, "loss": 0.0001, "num_tokens": 2344164.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008045547641813755, "kl": 0.0008295339066535234, "learning_rate": 6.79e-07, "loss": 0.0, "num_tokens": 2344448.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 147.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03112046979367733, "kl": 0.044505782425403595, "learning_rate": 6.786666666666667e-07, "loss": 0.0022, "num_tokens": 2344852.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 147.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.16668641567230225, "kl": 0.08013206720352173, "learning_rate": 6.783333333333334e-07, "loss": 0.004, "num_tokens": 2345242.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 147.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830543041229248, "kl": 0.015461879782378674, "learning_rate": 6.78e-07, "loss": 0.0008, "num_tokens": 2345571.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 147.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03035755455493927, "kl": 0.0059231220511719584, "learning_rate": 6.776666666666667e-07, "loss": 0.0004, "num_tokens": 2345928.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 147.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 9.366961479187012, "kl": 0.04303776248707436, "learning_rate": 6.773333333333334e-07, "loss": 0.2485, "num_tokens": 2346158.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 147.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.06187133118510246, "kl": 0.0033652736456133425, "learning_rate": 6.769999999999999e-07, "loss": 0.0002, "num_tokens": 2346480.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 147.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.013169502839446068, "kl": 0.0004002231962658698, "learning_rate": 6.766666666666667e-07, "loss": 0.0, "num_tokens": 2346714.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 147.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.01761413738131523, "kl": 0.003219752514269203, "learning_rate": 6.763333333333334e-07, "loss": 0.0002, "num_tokens": 2347003.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 147.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.045347876846790314, "kl": 0.00628051976673305, "learning_rate": 6.76e-07, "loss": 0.0003, "num_tokens": 2347295.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 147.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.11145929992198944, "kl": 0.0370666328817606, "learning_rate": 6.756666666666666e-07, "loss": 0.0019, "num_tokens": 2347632.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.026025963947176933, "kl": 0.009753241203725338, "learning_rate": 6.753333333333334e-07, "loss": 0.0005, "num_tokens": 2347918.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 147.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0303365346044302, "kl": 0.0010238439062959515, "learning_rate": 6.75e-07, "loss": 0.0001, "num_tokens": 2348224.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 147.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.016308927908539772, "kl": 0.2659081071615219, "learning_rate": 6.746666666666667e-07, "loss": 0.0133, "num_tokens": 2348528.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 147.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.01003574300557375, "kl": 0.00020775644952664152, "learning_rate": 6.743333333333333e-07, "loss": 0.0, "num_tokens": 2348798.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 147.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0372542105615139, "kl": 0.00859007053077221, "learning_rate": 6.74e-07, "loss": 0.0004, "num_tokens": 2349155.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 147.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.595075607299805, "kl": 0.04293276369571686, "learning_rate": 6.736666666666667e-07, "loss": -0.0044, "num_tokens": 2349470.0, "reward": 2.0, "reward_std": 2.4494898319244385, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 2.4494898319244385, "step": 7980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.012321427464485168, "kl": 0.00736632477492094, "learning_rate": 6.733333333333334e-07, "loss": 0.0004, "num_tokens": 2349742.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 147.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.005781739484518766, "kl": 0.00030984529439592734, "learning_rate": 6.73e-07, "loss": 0.0, "num_tokens": 2350004.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 147.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.13076011836528778, "kl": 0.014736629091203213, "learning_rate": 6.726666666666666e-07, "loss": 0.0007, "num_tokens": 2350299.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 147.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.022307464852929115, "kl": 0.003343239426612854, "learning_rate": 6.723333333333334e-07, "loss": 0.0002, "num_tokens": 2350572.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 147.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.07655911147594452, "kl": 0.004491940140724182, "learning_rate": 6.72e-07, "loss": 0.0002, "num_tokens": 2350840.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 147.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0214406568557024, "kl": 0.01185589050874114, "learning_rate": 6.716666666666667e-07, "loss": 0.0006, "num_tokens": 2351100.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.5018789768218994, "kl": 0.1297360584139824, "learning_rate": 6.713333333333333e-07, "loss": 0.0063, "num_tokens": 2351414.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 147.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06903313845396042, "kl": 0.03757285699248314, "learning_rate": 6.71e-07, "loss": 0.0021, "num_tokens": 2351795.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.25, "completions/mean_terminated_length": 2.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 147.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 21.11982536315918, "kl": 0.12326832115650177, "learning_rate": 6.706666666666667e-07, "loss": 0.0888, "num_tokens": 2352008.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 7989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 147.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.008687658235430717, "kl": 0.09840095788240433, "learning_rate": 6.703333333333334e-07, "loss": 0.0049, "num_tokens": 2352380.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 147.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 2.0136325359344482, "kl": 0.23587112640962005, "learning_rate": 6.7e-07, "loss": 0.0135, "num_tokens": 2352711.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 148.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.059992823749780655, "kl": 0.009034299524500966, "learning_rate": 6.696666666666666e-07, "loss": 0.0005, "num_tokens": 2353025.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 148.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.013757170177996159, "kl": 0.00013109147403156385, "learning_rate": 6.693333333333334e-07, "loss": 0.0, "num_tokens": 2353281.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 148.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.5506874918937683, "kl": 0.15094274282455444, "learning_rate": 6.690000000000001e-07, "loss": 0.007, "num_tokens": 2353630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.027551544830203056, "kl": 0.002665444160811603, "learning_rate": 6.686666666666666e-07, "loss": 0.0001, "num_tokens": 2353903.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 148.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07488074153661728, "kl": 0.03005118388682604, "learning_rate": 6.683333333333333e-07, "loss": 0.0015, "num_tokens": 2354175.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 148.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.05191029980778694, "kl": 0.0031926408410072327, "learning_rate": 6.680000000000001e-07, "loss": 0.0002, "num_tokens": 2354419.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008011865429580212, "kl": 0.0037679076194763184, "learning_rate": 6.676666666666667e-07, "loss": 0.0002, "num_tokens": 2354655.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.009269864298403263, "kl": 0.000871806318173185, "learning_rate": 6.673333333333334e-07, "loss": 0.0, "num_tokens": 2354937.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 148.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.006255480460822582, "kl": 0.0004456430615391582, "learning_rate": 6.67e-07, "loss": 0.0, "num_tokens": 2355197.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 148.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.032542139291763306, "kl": 0.027465634047985077, "learning_rate": 6.666666666666666e-07, "loss": 0.0014, "num_tokens": 2355563.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 148.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05432083085179329, "kl": 0.013635757379233837, "learning_rate": 6.663333333333334e-07, "loss": 0.0007, "num_tokens": 2355899.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 148.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.18009230494499207, "kl": 0.017343849409371614, "learning_rate": 6.660000000000001e-07, "loss": 0.001, "num_tokens": 2356170.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.035313963890075684, "kl": 0.01275394344702363, "learning_rate": 6.656666666666666e-07, "loss": 0.0007, "num_tokens": 2356444.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 148.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03571133315563202, "kl": 0.003046083264052868, "learning_rate": 6.653333333333333e-07, "loss": 0.0002, "num_tokens": 2356756.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 148.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0070494115352630615, "kl": 0.0003920156304957345, "learning_rate": 6.650000000000001e-07, "loss": 0.0, "num_tokens": 2357075.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 148.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02196187898516655, "kl": 0.0010163102997466922, "learning_rate": 6.646666666666667e-07, "loss": 0.0001, "num_tokens": 2357355.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 148.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.021372133865952492, "kl": 0.011771119199693203, "learning_rate": 6.643333333333333e-07, "loss": 0.0006, "num_tokens": 2357615.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 148.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.7622270584106445, "kl": 0.12751147523522377, "learning_rate": 6.64e-07, "loss": 0.0067, "num_tokens": 2357907.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 148.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.028814680874347687, "kl": 0.006982567410886986, "learning_rate": 6.636666666666667e-07, "loss": 0.0003, "num_tokens": 2358179.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10555873811244965, "kl": 0.0092735611833632, "learning_rate": 6.633333333333334e-07, "loss": 0.0005, "num_tokens": 2358479.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 148.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.034490421414375305, "kl": 0.009673627093434334, "learning_rate": 6.63e-07, "loss": 0.0005, "num_tokens": 2358782.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02914639189839363, "kl": 0.0002091825008392334, "learning_rate": 6.626666666666666e-07, "loss": 0.0, "num_tokens": 2358994.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 148.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.010764111764729023, "kl": 0.006849497323855758, "learning_rate": 6.623333333333333e-07, "loss": 0.0003, "num_tokens": 2359286.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 148.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.009475374594330788, "kl": 0.09832324832677841, "learning_rate": 6.620000000000001e-07, "loss": 0.0049, "num_tokens": 2359658.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 148.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.027653368189930916, "kl": 0.0018219202756881714, "learning_rate": 6.616666666666666e-07, "loss": 0.0001, "num_tokens": 2359870.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.17658135294914246, "kl": 0.011822124011814594, "learning_rate": 6.613333333333333e-07, "loss": 0.0007, "num_tokens": 2360089.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 148.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.11850176751613617, "kl": 0.015849125338718295, "learning_rate": 6.61e-07, "loss": 0.0008, "num_tokens": 2360413.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 148.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.222862482070923, "kl": 0.07072535157203674, "learning_rate": 6.606666666666667e-07, "loss": -0.025, "num_tokens": 2360743.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 148.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10633479803800583, "kl": 0.016889100894331932, "learning_rate": 6.603333333333334e-07, "loss": 0.0007, "num_tokens": 2361068.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.8403096199035645, "kl": 0.10900132835377008, "learning_rate": 6.6e-07, "loss": 0.2219, "num_tokens": 2361369.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 8021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 148.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.04380648210644722, "kl": 0.1606292575597763, "learning_rate": 6.596666666666667e-07, "loss": 0.008, "num_tokens": 2361680.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 148.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.019948706030845642, "kl": 0.00022693723440170288, "learning_rate": 6.593333333333333e-07, "loss": 0.0, "num_tokens": 2361888.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 148.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 4.14555025100708, "kl": 0.10182899609208107, "learning_rate": 6.590000000000001e-07, "loss": 0.2441, "num_tokens": 2362198.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 8024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 148.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.7454323768615723, "kl": 0.015445174183696508, "learning_rate": 6.586666666666667e-07, "loss": -0.0016, "num_tokens": 2362532.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 8025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04945673793554306, "kl": 0.001501104183262214, "learning_rate": 6.583333333333333e-07, "loss": 0.0001, "num_tokens": 2362751.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05545986443758011, "kl": 0.001404590904712677, "learning_rate": 6.58e-07, "loss": 0.0001, "num_tokens": 2362971.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.08915232867002487, "kl": 0.014578057453036308, "learning_rate": 6.576666666666668e-07, "loss": 0.0007, "num_tokens": 2363249.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 148.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.056035757064819336, "kl": 0.04754984565079212, "learning_rate": 6.573333333333333e-07, "loss": 0.0024, "num_tokens": 2363658.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 82.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 148.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.023529052734375, "kl": 0.027549312449991703, "learning_rate": 6.57e-07, "loss": 0.418, "num_tokens": 2364206.0, "reward": 3.049999952316284, "reward_std": 1.899999976158142, "rewards/reward_combined/mean": 3.049999952316284, "rewards/reward_combined/std": 1.899999976158142, "step": 8030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 148.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.026580994948744774, "kl": 0.00195881724357605, "learning_rate": 6.566666666666667e-07, "loss": 0.0001, "num_tokens": 2364466.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 148.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.019084108993411064, "kl": 0.0006825370655860752, "learning_rate": 6.563333333333333e-07, "loss": 0.0, "num_tokens": 2364702.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.010667764581739902, "kl": 0.0013720928691327572, "learning_rate": 6.56e-07, "loss": 0.0001, "num_tokens": 2364972.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 148.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.053804799914360046, "kl": 0.026516889221966267, "learning_rate": 6.556666666666667e-07, "loss": 0.0014, "num_tokens": 2365261.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 148.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.017637724056839943, "kl": 0.26560617983341217, "learning_rate": 6.553333333333333e-07, "loss": 0.0133, "num_tokens": 2365565.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 148.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.3374927043914795, "kl": 0.010917457402683794, "learning_rate": 6.55e-07, "loss": -0.0376, "num_tokens": 2365867.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8036 }, { "clip_ratio/high_max": 0.007936508394777775, "clip_ratio/high_mean": 0.007936508394777775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007936508394777775, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 148.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.1461198329925537, "kl": 0.2187935635447502, "learning_rate": 6.546666666666668e-07, "loss": -0.0485, "num_tokens": 2366233.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 148.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.414330244064331, "kl": 0.14570897817611694, "learning_rate": 6.543333333333333e-07, "loss": -0.0005, "num_tokens": 2366578.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 148.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.015954479575157166, "kl": 0.00042009057870018296, "learning_rate": 6.54e-07, "loss": 0.0, "num_tokens": 2366887.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 148.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07917128503322601, "kl": 0.03889700584113598, "learning_rate": 6.536666666666667e-07, "loss": 0.0019, "num_tokens": 2367215.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 148.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.18024809658527374, "kl": 0.02083851397037506, "learning_rate": 6.533333333333334e-07, "loss": 0.0011, "num_tokens": 2367558.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.09989302605390549, "kl": 0.008720326703041792, "learning_rate": 6.53e-07, "loss": 0.0004, "num_tokens": 2367824.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 148.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.034171611070632935, "kl": 0.0009974651038646698, "learning_rate": 6.526666666666667e-07, "loss": 0.0, "num_tokens": 2368084.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 148.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.5776098370552063, "kl": 0.06068794883321971, "learning_rate": 6.523333333333333e-07, "loss": 0.0036, "num_tokens": 2368380.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 148.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02446579746901989, "kl": 0.0026450157165527344, "learning_rate": 6.52e-07, "loss": 0.0001, "num_tokens": 2368712.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 149.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.8118560314178467, "kl": 0.019750438630580902, "learning_rate": 6.516666666666667e-07, "loss": -0.0311, "num_tokens": 2369005.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 149.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07364393770694733, "kl": 0.0026061697863042355, "learning_rate": 6.513333333333333e-07, "loss": 0.0001, "num_tokens": 2369276.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 149.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.026624208316206932, "kl": 0.012482813559472561, "learning_rate": 6.51e-07, "loss": 0.0007, "num_tokens": 2369550.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 149.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.06158352643251419, "kl": 0.0038567623123526573, "learning_rate": 6.506666666666667e-07, "loss": 0.0002, "num_tokens": 2369815.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 149.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.1643344610929489, "kl": 0.025559797883033752, "learning_rate": 6.503333333333333e-07, "loss": 0.0014, "num_tokens": 2370094.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 149.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.012544273398816586, "kl": 0.0007855668663978577, "learning_rate": 6.5e-07, "loss": 0.0, "num_tokens": 2370302.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 149.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.010793453082442284, "kl": 0.0006618913030251861, "learning_rate": 6.496666666666667e-07, "loss": 0.0, "num_tokens": 2370624.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 149.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.09990867972373962, "kl": 0.2681703567504883, "learning_rate": 6.493333333333334e-07, "loss": 0.0134, "num_tokens": 2370928.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 149.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.013178651221096516, "kl": 0.00012609064651769586, "learning_rate": 6.49e-07, "loss": 0.0, "num_tokens": 2371184.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 149.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.04322998970746994, "kl": 0.04183654114603996, "learning_rate": 6.486666666666667e-07, "loss": 0.0021, "num_tokens": 2371588.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 149.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.003713405691087246, "kl": 0.0002658894009073265, "learning_rate": 6.483333333333334e-07, "loss": 0.0, "num_tokens": 2371831.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 149.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 1.5020883083343506, "kl": 0.23826029431074858, "learning_rate": 6.48e-07, "loss": 0.0126, "num_tokens": 2372122.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 149.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01584302820265293, "kl": 0.15759839117527008, "learning_rate": 6.476666666666667e-07, "loss": 0.0079, "num_tokens": 2372433.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 149.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0945601835846901, "kl": 0.022564067505300045, "learning_rate": 6.473333333333334e-07, "loss": 0.0011, "num_tokens": 2372740.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 149.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.1706409454345703, "kl": 0.010000812355428934, "learning_rate": 6.47e-07, "loss": -0.0933, "num_tokens": 2373018.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 8060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 149.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.026172997429966927, "kl": 0.0016213970957323909, "learning_rate": 6.466666666666667e-07, "loss": 0.0001, "num_tokens": 2373290.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 149.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.560152053833008, "kl": 0.05466253496706486, "learning_rate": 6.463333333333334e-07, "loss": 0.0258, "num_tokens": 2373676.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 149.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.016302743926644325, "kl": 0.0007113851606845856, "learning_rate": 6.459999999999999e-07, "loss": 0.0, "num_tokens": 2373968.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 149.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.025030776858329773, "kl": 0.012033788254484534, "learning_rate": 6.456666666666667e-07, "loss": 0.0005, "num_tokens": 2374324.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 149.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.031675782054662704, "kl": 0.003703831462189555, "learning_rate": 6.453333333333334e-07, "loss": 0.0002, "num_tokens": 2374656.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 149.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.026614150032401085, "kl": 0.0017110109329223633, "learning_rate": 6.45e-07, "loss": 0.0001, "num_tokens": 2374868.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 149.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003367283206898719, "kl": 7.336835187743418e-05, "learning_rate": 6.446666666666666e-07, "loss": 0.0, "num_tokens": 2375180.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 149.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.027076590806245804, "kl": 0.0008286722004413605, "learning_rate": 6.443333333333334e-07, "loss": 0.0, "num_tokens": 2375440.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 149.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03575679659843445, "kl": 0.008242711424827576, "learning_rate": 6.44e-07, "loss": 0.0004, "num_tokens": 2375792.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 149.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03205917775630951, "kl": 0.00024272501468658447, "learning_rate": 6.436666666666667e-07, "loss": 0.0, "num_tokens": 2376004.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 149.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12120383232831955, "kl": 0.02965997252613306, "learning_rate": 6.433333333333334e-07, "loss": 0.0014, "num_tokens": 2376328.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 149.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010393207194283605, "kl": 0.001351326471194625, "learning_rate": 6.43e-07, "loss": 0.0001, "num_tokens": 2376605.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 149.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03497472032904625, "kl": 0.05707773193717003, "learning_rate": 6.426666666666667e-07, "loss": 0.0029, "num_tokens": 2376986.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 149.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09369508177042007, "kl": 0.005586998537182808, "learning_rate": 6.423333333333334e-07, "loss": 0.0003, "num_tokens": 2377247.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 149.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.6338117122650146, "kl": 0.14234613627195358, "learning_rate": 6.42e-07, "loss": -0.0622, "num_tokens": 2377600.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 149.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.020768476650118828, "kl": 0.0062638719828100875, "learning_rate": 6.416666666666666e-07, "loss": 0.0003, "num_tokens": 2377872.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 149.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 8.97605037689209, "kl": 1.336567960679531, "learning_rate": 6.413333333333334e-07, "loss": 0.164, "num_tokens": 2378110.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 149.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02859625220298767, "kl": 0.10031475871801376, "learning_rate": 6.41e-07, "loss": 0.005, "num_tokens": 2378482.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 149.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.009700460359454155, "kl": 0.0005378978530643508, "learning_rate": 6.406666666666667e-07, "loss": 0.0, "num_tokens": 2378717.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 149.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.086411952972412, "kl": 0.04046951234340668, "learning_rate": 6.403333333333333e-07, "loss": 0.0106, "num_tokens": 2379073.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 149.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.591157913208008, "kl": 0.07929861824959517, "learning_rate": 6.4e-07, "loss": -0.0576, "num_tokens": 2379353.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 8081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 149.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.724581003189087, "kl": 0.03500372124835849, "learning_rate": 6.396666666666667e-07, "loss": -0.0317, "num_tokens": 2379653.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 149.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.036604177206754684, "kl": 0.005146813113242388, "learning_rate": 6.393333333333334e-07, "loss": 0.0003, "num_tokens": 2379989.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 149.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.546838402748108, "kl": 0.09196072816848755, "learning_rate": 6.39e-07, "loss": -0.0941, "num_tokens": 2380330.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 149.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02282535843551159, "kl": 0.011365019716322422, "learning_rate": 6.386666666666666e-07, "loss": 0.0006, "num_tokens": 2380590.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 149.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0218175258487463, "kl": 0.0015938090509735048, "learning_rate": 6.383333333333334e-07, "loss": 0.0001, "num_tokens": 2380813.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 149.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.014930716715753078, "kl": 0.006839309353381395, "learning_rate": 6.380000000000001e-07, "loss": 0.0003, "num_tokens": 2381102.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 149.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07219650596380234, "kl": 0.014264614321291447, "learning_rate": 6.376666666666666e-07, "loss": 0.0007, "num_tokens": 2381431.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 149.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06212437152862549, "kl": 0.011779449065215886, "learning_rate": 6.373333333333333e-07, "loss": 0.0006, "num_tokens": 2381713.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 149.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06987587362527847, "kl": 0.004480735864490271, "learning_rate": 6.370000000000001e-07, "loss": 0.0002, "num_tokens": 2381985.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 149.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03254120796918869, "kl": 0.007599015021696687, "learning_rate": 6.366666666666667e-07, "loss": 0.0004, "num_tokens": 2382278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 149.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763347774744034, "kl": 0.04218504764139652, "learning_rate": 6.363333333333334e-07, "loss": 0.0022, "num_tokens": 2382568.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 149.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002058211830444634, "kl": 4.693865776062012e-06, "learning_rate": 6.36e-07, "loss": 0.0, "num_tokens": 2382788.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 149.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.16519120335578918, "kl": 0.03383318521082401, "learning_rate": 6.356666666666666e-07, "loss": 0.0016, "num_tokens": 2383112.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 149.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.19964449107646942, "kl": 0.011453303974121809, "learning_rate": 6.353333333333334e-07, "loss": 0.0006, "num_tokens": 2383351.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 149.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.00947615411132574, "kl": 0.0016359619330614805, "learning_rate": 6.350000000000001e-07, "loss": 0.0001, "num_tokens": 2383647.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 149.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0793096050620079, "kl": 0.012051485944539309, "learning_rate": 6.346666666666666e-07, "loss": 0.0006, "num_tokens": 2383927.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 149.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.2394258677959442, "kl": 0.03626616485416889, "learning_rate": 6.343333333333333e-07, "loss": 0.0016, "num_tokens": 2384244.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 149.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.014735878445208073, "kl": 0.0004193728236714378, "learning_rate": 6.340000000000001e-07, "loss": 0.0, "num_tokens": 2384506.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 150.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.02108786255121231, "kl": 0.0026528770104050636, "learning_rate": 6.336666666666667e-07, "loss": 0.0001, "num_tokens": 2384818.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 150.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07298009097576141, "kl": 0.041613128036260605, "learning_rate": 6.333333333333333e-07, "loss": 0.0021, "num_tokens": 2385116.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 150.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04171275720000267, "kl": 0.0011490675387904048, "learning_rate": 6.33e-07, "loss": 0.0001, "num_tokens": 2385373.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 150.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.4373611807823181, "kl": 0.03042233525775373, "learning_rate": 6.326666666666667e-07, "loss": 0.0016, "num_tokens": 2385643.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 150.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.1503992080688477, "kl": 0.0628511905670166, "learning_rate": 6.323333333333334e-07, "loss": 0.2016, "num_tokens": 2386031.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 8104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 150.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.06314212083816528, "kl": 0.007682716008275747, "learning_rate": 6.32e-07, "loss": 0.0005, "num_tokens": 2386298.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 150.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036239244509488344, "kl": 0.0002377443015575409, "learning_rate": 6.316666666666666e-07, "loss": 0.0, "num_tokens": 2386541.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.09184882044792175, "kl": 0.014567125719622709, "learning_rate": 6.313333333333333e-07, "loss": 0.0006, "num_tokens": 2386862.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 150.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1954093873500824, "kl": 0.04638131766114384, "learning_rate": 6.310000000000001e-07, "loss": 0.0026, "num_tokens": 2387161.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 150.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.02628750540316105, "kl": 0.001577138900756836, "learning_rate": 6.306666666666666e-07, "loss": 0.0001, "num_tokens": 2387373.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 150.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.004877230618149042, "kl": 0.00024380088143516332, "learning_rate": 6.303333333333333e-07, "loss": 0.0, "num_tokens": 2387593.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 150.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.016999825835227966, "kl": 0.0036536535844788887, "learning_rate": 6.3e-07, "loss": 0.0002, "num_tokens": 2387853.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 150.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 7.579219341278076, "kl": 0.029781543475110084, "learning_rate": 6.296666666666667e-07, "loss": 0.1949, "num_tokens": 2388126.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0728498324751854, "kl": 0.018589243292808533, "learning_rate": 6.293333333333334e-07, "loss": 0.0009, "num_tokens": 2388398.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 150.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.9928205013275146, "kl": 0.024965515360236168, "learning_rate": 6.29e-07, "loss": 0.1113, "num_tokens": 2388741.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 150.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.017191950231790543, "kl": 0.04064549319446087, "learning_rate": 6.286666666666667e-07, "loss": 0.002, "num_tokens": 2389146.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 150.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.48533296585083, "kl": 0.03565186820924282, "learning_rate": 6.283333333333333e-07, "loss": 0.0324, "num_tokens": 2389447.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 150.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05955472216010094, "kl": 0.020073309540748596, "learning_rate": 6.280000000000001e-07, "loss": 0.001, "num_tokens": 2389751.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 150.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.033035438507795334, "kl": 0.01324456837028265, "learning_rate": 6.276666666666667e-07, "loss": 0.0007, "num_tokens": 2390055.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 150.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.610957145690918, "kl": 1.340236946940422, "learning_rate": 6.273333333333333e-07, "loss": 0.0463, "num_tokens": 2390425.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 8119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 150.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.10300295054912567, "kl": 0.012131822062656283, "learning_rate": 6.27e-07, "loss": 0.0006, "num_tokens": 2390727.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 150.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.17439605295658112, "kl": 0.06616406515240669, "learning_rate": 6.266666666666668e-07, "loss": 0.0033, "num_tokens": 2391094.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 150.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.015556970611214638, "kl": 0.006047853392374236, "learning_rate": 6.263333333333333e-07, "loss": 0.0003, "num_tokens": 2391362.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 150.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.025696801021695137, "kl": 0.012086418457329273, "learning_rate": 6.26e-07, "loss": 0.0007, "num_tokens": 2391636.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005189818912185729, "kl": 0.0012384653673507273, "learning_rate": 6.256666666666667e-07, "loss": 0.0001, "num_tokens": 2391916.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 150.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02235209196805954, "kl": 0.011526144109666348, "learning_rate": 6.253333333333333e-07, "loss": 0.0006, "num_tokens": 2392176.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02855044975876808, "kl": 0.009233035147190094, "learning_rate": 6.25e-07, "loss": 0.0005, "num_tokens": 2392467.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 150.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.018995990976691246, "kl": 0.2653230279684067, "learning_rate": 6.246666666666667e-07, "loss": 0.0133, "num_tokens": 2392771.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 150.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.026809118688106537, "kl": 0.0009299021621700376, "learning_rate": 6.243333333333333e-07, "loss": 0.0, "num_tokens": 2393087.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 150.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077546448446810246, "kl": 0.009261199971660972, "learning_rate": 6.24e-07, "loss": 0.0004, "num_tokens": 2393373.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 150.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.06505013257265091, "kl": 0.058848634362220764, "learning_rate": 6.236666666666668e-07, "loss": 0.0029, "num_tokens": 2393749.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 150.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.027938757091760635, "kl": 0.000877678394317627, "learning_rate": 6.233333333333333e-07, "loss": 0.0, "num_tokens": 2394009.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 150.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03115653619170189, "kl": 0.032979780808091164, "learning_rate": 6.23e-07, "loss": 0.0017, "num_tokens": 2394376.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 150.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05155746638774872, "kl": 0.00849071890115738, "learning_rate": 6.226666666666667e-07, "loss": 0.0004, "num_tokens": 2394658.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 150.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.026812391355633736, "kl": 0.0014806622930336744, "learning_rate": 6.223333333333334e-07, "loss": 0.0001, "num_tokens": 2394938.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 150.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.3631722927093506, "kl": 0.016186361317522824, "learning_rate": 6.22e-07, "loss": 0.2206, "num_tokens": 2395312.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 150.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.006051879376173019, "kl": 0.1613881066441536, "learning_rate": 6.216666666666667e-07, "loss": 0.0081, "num_tokens": 2395621.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 150.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.970693588256836, "kl": 0.017016710247844458, "learning_rate": 6.213333333333333e-07, "loss": 0.1097, "num_tokens": 2395980.0, "reward": 5.25, "reward_std": 5.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 5.5, "step": 8137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 150.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 1.0967727899551392, "kl": 0.14109086815733463, "learning_rate": 6.21e-07, "loss": 0.0076, "num_tokens": 2396303.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 150.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.029974566772580147, "kl": 0.009257845114916563, "learning_rate": 6.206666666666667e-07, "loss": 0.0005, "num_tokens": 2396625.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 150.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.22037850320339203, "kl": 0.026107670506462455, "learning_rate": 6.203333333333333e-07, "loss": 0.0013, "num_tokens": 2396947.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 150.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.011543075554072857, "kl": 0.007443260634317994, "learning_rate": 6.2e-07, "loss": 0.0004, "num_tokens": 2397219.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 150.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023880486842244864, "kl": 1.6517937183380127e-05, "learning_rate": 6.196666666666667e-07, "loss": 0.0, "num_tokens": 2397431.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 150.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06675207614898682, "kl": 0.009099230170249939, "learning_rate": 6.193333333333333e-07, "loss": 0.0005, "num_tokens": 2397743.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 150.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0447884164750576, "kl": 0.0016662002162775025, "learning_rate": 6.19e-07, "loss": 0.0001, "num_tokens": 2397976.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 150.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01102589163929224, "kl": 0.0026854098541662097, "learning_rate": 6.186666666666667e-07, "loss": 0.0001, "num_tokens": 2398242.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 150.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00029954116325825453, "kl": 6.973743438720703e-06, "learning_rate": 6.183333333333334e-07, "loss": 0.0, "num_tokens": 2398462.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.021404866129159927, "kl": 0.002805741038173437, "learning_rate": 6.18e-07, "loss": 0.0001, "num_tokens": 2398752.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 150.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.00955558568239212, "kl": 0.0019166171550750732, "learning_rate": 6.176666666666667e-07, "loss": 0.0001, "num_tokens": 2398968.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 150.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.007257889024913311, "kl": 0.0014499574899673462, "learning_rate": 6.173333333333334e-07, "loss": 0.0001, "num_tokens": 2399264.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 150.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.029709329828619957, "kl": 0.0026041120290756226, "learning_rate": 6.17e-07, "loss": 0.0001, "num_tokens": 2399538.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 150.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.09406719356775284, "kl": 0.03556140046566725, "learning_rate": 6.166666666666667e-07, "loss": 0.0017, "num_tokens": 2399884.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 150.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.034665584564208984, "kl": 0.0019402316538617015, "learning_rate": 6.163333333333334e-07, "loss": 0.0001, "num_tokens": 2400156.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 150.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009009314235299826, "kl": 0.003742985427379608, "learning_rate": 6.16e-07, "loss": 0.0002, "num_tokens": 2400392.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 151.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03838036581873894, "kl": 0.0008770450949668884, "learning_rate": 6.156666666666667e-07, "loss": 0.0, "num_tokens": 2400604.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 151.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00961221195757389, "kl": 0.0018416047096252441, "learning_rate": 6.153333333333334e-07, "loss": 0.0001, "num_tokens": 2400820.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 151.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01988982781767845, "kl": 0.26517385244369507, "learning_rate": 6.149999999999999e-07, "loss": 0.0133, "num_tokens": 2401124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 151.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.031023859977722168, "kl": 0.004424812505021691, "learning_rate": 6.146666666666667e-07, "loss": 0.0002, "num_tokens": 2401423.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 151.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.12773260474205017, "kl": 0.06399892829358578, "learning_rate": 6.143333333333334e-07, "loss": 0.0032, "num_tokens": 2401803.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 151.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04989417642354965, "kl": 0.001959426503162831, "learning_rate": 6.14e-07, "loss": 0.0001, "num_tokens": 2402067.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 151.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021941643208265305, "kl": 0.0004234723746776581, "learning_rate": 6.136666666666666e-07, "loss": 0.0, "num_tokens": 2402327.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 151.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.023062046617269516, "kl": 0.006290606688708067, "learning_rate": 6.133333333333334e-07, "loss": 0.0003, "num_tokens": 2402595.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 151.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00020509045862127095, "kl": 4.723668098449707e-06, "learning_rate": 6.13e-07, "loss": 0.0, "num_tokens": 2402815.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 151.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.3017526865005493, "kl": 0.02648480422794819, "learning_rate": 6.126666666666667e-07, "loss": 0.0159, "num_tokens": 2403105.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 151.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05675193667411804, "kl": 0.01713305152952671, "learning_rate": 6.123333333333334e-07, "loss": 0.0009, "num_tokens": 2403393.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 151.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.6757495403289795, "kl": 0.019472898915410042, "learning_rate": 6.12e-07, "loss": 0.1566, "num_tokens": 2403740.0, "reward": 5.625, "reward_std": 4.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 4.75, "step": 8165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 151.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014792782254517078, "kl": 0.0007511275762226433, "learning_rate": 6.116666666666667e-07, "loss": 0.0, "num_tokens": 2404060.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 151.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.2755134105682373, "kl": 0.0266859628027305, "learning_rate": 6.113333333333334e-07, "loss": 0.0015, "num_tokens": 2404360.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 151.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.3069378435611725, "kl": 0.027982468833215535, "learning_rate": 6.11e-07, "loss": 0.0016, "num_tokens": 2404655.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 151.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.004815730266273022, "kl": 0.0002308189868927002, "learning_rate": 6.106666666666666e-07, "loss": 0.0, "num_tokens": 2404875.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 151.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.221113681793213, "kl": 0.059925079345703125, "learning_rate": 6.103333333333334e-07, "loss": 0.0693, "num_tokens": 2405174.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 151.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.526832103729248, "kl": 0.2515314519405365, "learning_rate": 6.1e-07, "loss": -0.0296, "num_tokens": 2405578.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 8171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 151.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0165316890925169, "kl": 0.0011796177714131773, "learning_rate": 6.096666666666667e-07, "loss": 0.0001, "num_tokens": 2405854.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 151.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.15650227665901184, "kl": 0.03975836560130119, "learning_rate": 6.093333333333333e-07, "loss": 0.002, "num_tokens": 2406152.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 151.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.019037781283259392, "kl": 0.0006773397326469421, "learning_rate": 6.09e-07, "loss": 0.0, "num_tokens": 2406360.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 151.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04480242729187012, "kl": 0.09906241297721863, "learning_rate": 6.086666666666667e-07, "loss": 0.005, "num_tokens": 2406732.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 151.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019578933715820312, "kl": 9.936392598319799e-05, "learning_rate": 6.083333333333334e-07, "loss": 0.0, "num_tokens": 2406988.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 151.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.008699082769453526, "kl": 0.0008562388538848609, "learning_rate": 6.08e-07, "loss": 0.0, "num_tokens": 2407270.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 151.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009310757159255445, "kl": 0.0037299245595932007, "learning_rate": 6.076666666666666e-07, "loss": 0.0002, "num_tokens": 2407506.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 151.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05726265162229538, "kl": 0.0031598604982718825, "learning_rate": 6.073333333333334e-07, "loss": 0.0002, "num_tokens": 2407780.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 151.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.026106981560587883, "kl": 0.00911693787202239, "learning_rate": 6.070000000000001e-07, "loss": 0.0005, "num_tokens": 2408107.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 151.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.06957307457923889, "kl": 0.011702904012054205, "learning_rate": 6.066666666666666e-07, "loss": 0.0006, "num_tokens": 2408381.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 151.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.027237365022301674, "kl": 0.00184708833694458, "learning_rate": 6.063333333333333e-07, "loss": 0.0001, "num_tokens": 2408593.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 71.5, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 71.5, "completions/mean_terminated_length": 71.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 151.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.41719388961792, "kl": 0.04268419893924147, "learning_rate": 6.060000000000001e-07, "loss": 0.4343, "num_tokens": 2409099.0, "reward": 6.625, "reward_std": 1.75, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 1.75, "step": 8183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 151.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.12419752776622772, "kl": 0.016777854412794113, "learning_rate": 6.056666666666667e-07, "loss": 0.0008, "num_tokens": 2409401.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 151.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.006372503004968166, "kl": 0.00023161139688454568, "learning_rate": 6.053333333333334e-07, "loss": 0.0, "num_tokens": 2409715.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 151.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0385553315281868, "kl": 0.006798181275371462, "learning_rate": 6.05e-07, "loss": 0.0003, "num_tokens": 2410029.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 151.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.09173041582107544, "kl": 0.009674896486103535, "learning_rate": 6.046666666666666e-07, "loss": 0.0005, "num_tokens": 2410363.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 151.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.562796592712402, "kl": 0.11481832526624203, "learning_rate": 6.043333333333334e-07, "loss": -0.0591, "num_tokens": 2410724.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.25, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 151.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.9219738245010376, "kl": 0.10625160112977028, "learning_rate": 6.040000000000001e-07, "loss": 0.178, "num_tokens": 2411185.0, "reward": 4.25, "reward_std": 4.092676162719727, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 4.092676162719727, "step": 8189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 151.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.023659836500883102, "kl": 0.0008029512246139348, "learning_rate": 6.036666666666666e-07, "loss": 0.0, "num_tokens": 2411420.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 151.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02845223993062973, "kl": 0.011919802287593484, "learning_rate": 6.033333333333333e-07, "loss": 0.0007, "num_tokens": 2411694.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 151.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.25670185685157776, "kl": 0.039418669417500496, "learning_rate": 6.030000000000001e-07, "loss": 0.0022, "num_tokens": 2411960.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 151.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 8.40407943725586, "kl": 0.036688029766082764, "learning_rate": 6.026666666666667e-07, "loss": 0.1726, "num_tokens": 2412230.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 151.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.004988313186913729, "kl": 0.0003746040165424347, "learning_rate": 6.023333333333333e-07, "loss": 0.0, "num_tokens": 2412474.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 151.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.00045529688941314816, "kl": 0.0012374690850265324, "learning_rate": 6.02e-07, "loss": 0.0001, "num_tokens": 2412754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 151.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.08678068965673447, "kl": 0.03031645342707634, "learning_rate": 6.016666666666667e-07, "loss": 0.0016, "num_tokens": 2413042.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 151.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06964217126369476, "kl": 0.005343659780919552, "learning_rate": 6.013333333333334e-07, "loss": 0.0003, "num_tokens": 2413319.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 151.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07598478347063065, "kl": 0.01780601590871811, "learning_rate": 6.01e-07, "loss": 0.0008, "num_tokens": 2413621.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 151.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 4.009413719177246, "kl": 0.1728421449661255, "learning_rate": 6.006666666666666e-07, "loss": 0.0602, "num_tokens": 2413935.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 151.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.027501234784722328, "kl": 0.0027347643626853824, "learning_rate": 6.003333333333333e-07, "loss": 0.0001, "num_tokens": 2414263.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 151.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.05053351819515228, "kl": 0.0061572156846523285, "learning_rate": 6.000000000000001e-07, "loss": 0.0003, "num_tokens": 2414575.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 151.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006172960624098778, "kl": 0.00041054486064240336, "learning_rate": 5.996666666666666e-07, "loss": 0.0, "num_tokens": 2414835.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 151.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.17876817286014557, "kl": 0.043079666793346405, "learning_rate": 5.993333333333333e-07, "loss": 0.0022, "num_tokens": 2415176.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 151.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08830154687166214, "kl": 0.033968967385590076, "learning_rate": 5.99e-07, "loss": 0.0019, "num_tokens": 2415498.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 151.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 4.388647556304932, "kl": 0.06377786211669445, "learning_rate": 5.986666666666667e-07, "loss": 0.059, "num_tokens": 2415806.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 151.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.054406873881816864, "kl": 0.02034620102494955, "learning_rate": 5.983333333333334e-07, "loss": 0.001, "num_tokens": 2416170.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 151.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.001832791487686336, "kl": 9.834766387939453e-06, "learning_rate": 5.98e-07, "loss": 0.0, "num_tokens": 2416382.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08657736331224442, "kl": 0.017484460957348347, "learning_rate": 5.976666666666667e-07, "loss": 0.001, "num_tokens": 2416666.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 152.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.033489666879177094, "kl": 0.004563005641102791, "learning_rate": 5.973333333333333e-07, "loss": 0.0002, "num_tokens": 2416978.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 152.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.006274409592151642, "kl": 0.0003683716058731079, "learning_rate": 5.970000000000001e-07, "loss": 0.0, "num_tokens": 2417238.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 152.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.012485303916037083, "kl": 0.0005397619243012741, "learning_rate": 5.966666666666667e-07, "loss": 0.0, "num_tokens": 2417557.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 152.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.014974136836826801, "kl": 0.002691819565370679, "learning_rate": 5.963333333333333e-07, "loss": 0.0001, "num_tokens": 2417846.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008965237648226321, "kl": 0.0037411153316497803, "learning_rate": 5.96e-07, "loss": 0.0002, "num_tokens": 2418082.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.020208600908517838, "kl": 0.006959666614420712, "learning_rate": 5.956666666666668e-07, "loss": 0.0003, "num_tokens": 2418371.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 152.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0057055833749473095, "kl": 0.0008480790420435369, "learning_rate": 5.953333333333333e-07, "loss": 0.0, "num_tokens": 2418649.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 152.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.14198848605155945, "kl": 0.06454429216682911, "learning_rate": 5.95e-07, "loss": 0.0032, "num_tokens": 2419025.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 152.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.034001439809799194, "kl": 0.012066320516169071, "learning_rate": 5.946666666666667e-07, "loss": 0.0007, "num_tokens": 2419299.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 152.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06343141943216324, "kl": 0.0058200303465127945, "learning_rate": 5.943333333333333e-07, "loss": 0.0003, "num_tokens": 2419622.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 152.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.006403795909136534, "kl": 0.0002362173399887979, "learning_rate": 5.94e-07, "loss": 0.0, "num_tokens": 2419936.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 152.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029874155297875404, "kl": 5.364914613892324e-05, "learning_rate": 5.936666666666667e-07, "loss": 0.0, "num_tokens": 2420208.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.1439686119556427, "kl": 0.010446007363498211, "learning_rate": 5.933333333333333e-07, "loss": 0.0006, "num_tokens": 2420427.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 152.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.10712690651416779, "kl": 0.0130801722407341, "learning_rate": 5.93e-07, "loss": 0.0006, "num_tokens": 2420722.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 152.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.10352638363838196, "kl": 0.007192038930952549, "learning_rate": 5.926666666666668e-07, "loss": 0.0003, "num_tokens": 2420986.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 152.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.2082226276397705, "kl": 0.09929879754781723, "learning_rate": 5.923333333333333e-07, "loss": -0.1475, "num_tokens": 2421351.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 152.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.052132461220026016, "kl": 0.058430589735507965, "learning_rate": 5.92e-07, "loss": 0.0029, "num_tokens": 2421690.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 152.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.022532550618052483, "kl": 0.011368549428880215, "learning_rate": 5.916666666666667e-07, "loss": 0.0006, "num_tokens": 2421950.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 152.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05951330065727234, "kl": 0.008515564724802971, "learning_rate": 5.913333333333334e-07, "loss": 0.0004, "num_tokens": 2422250.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 4.093994617462158, "kl": 0.0455700708553195, "learning_rate": 5.91e-07, "loss": 0.1604, "num_tokens": 2422533.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 8228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 152.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.007217801176011562, "kl": 0.16149814426898956, "learning_rate": 5.906666666666667e-07, "loss": 0.0081, "num_tokens": 2422842.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 152.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0687694326043129, "kl": 0.003531071590259671, "learning_rate": 5.903333333333333e-07, "loss": 0.0002, "num_tokens": 2423049.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 152.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01360238716006279, "kl": 0.0017036875651683658, "learning_rate": 5.9e-07, "loss": 0.0001, "num_tokens": 2423319.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 152.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.009090722538530827, "kl": 0.00032445043325424194, "learning_rate": 5.896666666666667e-07, "loss": 0.0, "num_tokens": 2423563.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 152.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.007899472489953041, "kl": 0.00012859403796028346, "learning_rate": 5.893333333333333e-07, "loss": 0.0, "num_tokens": 2423819.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 152.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1338353157043457, "kl": 0.02521562296897173, "learning_rate": 5.89e-07, "loss": 0.0013, "num_tokens": 2424131.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 152.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.06179427355527878, "kl": 0.012030292768031359, "learning_rate": 5.886666666666667e-07, "loss": 0.0006, "num_tokens": 2424413.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.022431688383221626, "kl": 0.00026538968086242676, "learning_rate": 5.883333333333333e-07, "loss": 0.0, "num_tokens": 2424625.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 152.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.031857602298259735, "kl": 0.03791419789195061, "learning_rate": 5.88e-07, "loss": 0.002, "num_tokens": 2424985.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 152.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.2835279703140259, "kl": 0.0566001208499074, "learning_rate": 5.876666666666667e-07, "loss": 0.0031, "num_tokens": 2425307.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 152.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04293827712535858, "kl": 0.01038780459202826, "learning_rate": 5.873333333333334e-07, "loss": 0.0005, "num_tokens": 2425631.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 152.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.07900404930114746, "kl": 0.005145840812474489, "learning_rate": 5.87e-07, "loss": 0.0003, "num_tokens": 2425909.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 152.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006395588279701769, "kl": 0.0013253133511170745, "learning_rate": 5.866666666666667e-07, "loss": 0.0001, "num_tokens": 2426186.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.00032005488174036145, "kl": 8.001923561096191e-06, "learning_rate": 5.863333333333334e-07, "loss": 0.0, "num_tokens": 2426406.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 152.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.15829156339168549, "kl": 0.03112404327839613, "learning_rate": 5.86e-07, "loss": 0.0016, "num_tokens": 2426743.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 152.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.022487297654151917, "kl": 0.00100295664742589, "learning_rate": 5.856666666666667e-07, "loss": 0.0001, "num_tokens": 2427011.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01829039677977562, "kl": 0.0015137278387555853, "learning_rate": 5.853333333333334e-07, "loss": 0.0001, "num_tokens": 2427232.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 152.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.026136793196201324, "kl": 0.001528024673461914, "learning_rate": 5.85e-07, "loss": 0.0001, "num_tokens": 2427444.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 152.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.028324615210294724, "kl": 0.09752213209867477, "learning_rate": 5.846666666666667e-07, "loss": 0.0049, "num_tokens": 2427816.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 152.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.006173395086079836, "kl": 0.00046771764755249023, "learning_rate": 5.843333333333334e-07, "loss": 0.0, "num_tokens": 2428076.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 152.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.9802799224853516, "kl": 0.1436002403497696, "learning_rate": 5.839999999999999e-07, "loss": 0.0393, "num_tokens": 2428480.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 8249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 152.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.334186553955078, "kl": 0.6243870556354523, "learning_rate": 5.836666666666667e-07, "loss": 0.0655, "num_tokens": 2428804.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 152.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.14654184877872467, "kl": 0.012031571473926306, "learning_rate": 5.833333333333334e-07, "loss": 0.0006, "num_tokens": 2429139.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 152.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01571676880121231, "kl": 0.0015681475342717022, "learning_rate": 5.83e-07, "loss": 0.0001, "num_tokens": 2429399.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.05559800565242767, "kl": 0.007426847703754902, "learning_rate": 5.826666666666666e-07, "loss": 0.0004, "num_tokens": 2429670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 152.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05603273585438728, "kl": 0.01806573662906885, "learning_rate": 5.823333333333334e-07, "loss": 0.0008, "num_tokens": 2429969.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 152.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.029086831957101822, "kl": 0.0017388327396474779, "learning_rate": 5.82e-07, "loss": 0.0001, "num_tokens": 2430265.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03411523252725601, "kl": 0.02471522707492113, "learning_rate": 5.816666666666667e-07, "loss": 0.0013, "num_tokens": 2430554.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 152.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.017528988420963287, "kl": 0.0006685789267066866, "learning_rate": 5.813333333333334e-07, "loss": 0.0, "num_tokens": 2430790.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 152.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.019602010026574135, "kl": 0.2652120590209961, "learning_rate": 5.81e-07, "loss": 0.0133, "num_tokens": 2431094.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 152.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.012098081409931183, "kl": 0.0028562715742737055, "learning_rate": 5.806666666666667e-07, "loss": 0.0001, "num_tokens": 2431385.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.006116027943789959, "kl": 0.0008660210878588259, "learning_rate": 5.803333333333334e-07, "loss": 0.0, "num_tokens": 2431669.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 152.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.41422438621521, "kl": 0.17736125737428665, "learning_rate": 5.8e-07, "loss": -0.114, "num_tokens": 2432010.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 153.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03262478858232498, "kl": 0.0066216300474479795, "learning_rate": 5.796666666666666e-07, "loss": 0.0003, "num_tokens": 2432343.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 153.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.715142726898193, "kl": 0.129999328404665, "learning_rate": 5.793333333333334e-07, "loss": 0.1642, "num_tokens": 2432666.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 8263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 153.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02258777618408203, "kl": 0.0113847223110497, "learning_rate": 5.79e-07, "loss": 0.0006, "num_tokens": 2432926.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 153.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 3.074158191680908, "kl": 0.2561429899651557, "learning_rate": 5.786666666666667e-07, "loss": 0.1084, "num_tokens": 2433204.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 8265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 153.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.02532315067946911, "kl": 0.0007674284279346466, "learning_rate": 5.783333333333333e-07, "loss": 0.0, "num_tokens": 2433464.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 153.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.01998990774154663, "kl": 0.0006948375084903091, "learning_rate": 5.78e-07, "loss": 0.0, "num_tokens": 2433685.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 153.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.562995195388794, "kl": 0.12784771621227264, "learning_rate": 5.776666666666667e-07, "loss": 0.0064, "num_tokens": 2434055.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 153.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.010777637362480164, "kl": 0.0026645335310604423, "learning_rate": 5.773333333333334e-07, "loss": 0.0001, "num_tokens": 2434321.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 153.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09361322969198227, "kl": 0.011937583331018686, "learning_rate": 5.77e-07, "loss": 0.0006, "num_tokens": 2434623.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 153.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.08961789309978485, "kl": 0.06746284291148186, "learning_rate": 5.766666666666666e-07, "loss": 0.0034, "num_tokens": 2435001.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 153.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.007029331289231777, "kl": 0.0008635367848910391, "learning_rate": 5.763333333333334e-07, "loss": 0.0, "num_tokens": 2435277.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 153.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.017782054841518402, "kl": 0.000344881416822318, "learning_rate": 5.760000000000001e-07, "loss": 0.0, "num_tokens": 2435533.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 153.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.015332967974245548, "kl": 0.0010843120398931205, "learning_rate": 5.756666666666666e-07, "loss": 0.0001, "num_tokens": 2435813.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 153.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04747886583209038, "kl": 0.005562614183872938, "learning_rate": 5.753333333333333e-07, "loss": 0.0003, "num_tokens": 2436099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 153.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.009058780036866665, "kl": 0.0003537870943546295, "learning_rate": 5.750000000000001e-07, "loss": 0.0, "num_tokens": 2436343.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 153.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09518574178218842, "kl": 0.0034202428651042283, "learning_rate": 5.746666666666667e-07, "loss": 0.0002, "num_tokens": 2436616.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 153.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022815570700913668, "kl": 4.509339669311885e-05, "learning_rate": 5.743333333333334e-07, "loss": 0.0, "num_tokens": 2436888.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 153.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.3772287368774414, "kl": 0.018303331453353167, "learning_rate": 5.74e-07, "loss": 0.0009, "num_tokens": 2437146.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 153.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03290873393416405, "kl": 0.04473191127181053, "learning_rate": 5.736666666666666e-07, "loss": 0.0022, "num_tokens": 2437550.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 153.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009710745071060956, "kl": 0.003725379705429077, "learning_rate": 5.733333333333334e-07, "loss": 0.0002, "num_tokens": 2437786.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 153.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.04793475568294525, "kl": 0.005475924350321293, "learning_rate": 5.730000000000001e-07, "loss": 0.0003, "num_tokens": 2438098.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 153.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.762014389038086, "kl": 0.048394862562417984, "learning_rate": 5.726666666666666e-07, "loss": 0.0175, "num_tokens": 2438463.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 153.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.00917720515280962, "kl": 0.15728817880153656, "learning_rate": 5.723333333333333e-07, "loss": 0.0078, "num_tokens": 2438774.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 153.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03727919980883598, "kl": 0.004796176450327039, "learning_rate": 5.720000000000001e-07, "loss": 0.0002, "num_tokens": 2439105.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 153.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.04206138476729393, "kl": 0.027701175771653652, "learning_rate": 5.716666666666667e-07, "loss": 0.0014, "num_tokens": 2439480.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 153.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07937157154083252, "kl": 0.012952920515090227, "learning_rate": 5.713333333333333e-07, "loss": 0.0007, "num_tokens": 2439808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 153.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002558810811024159, "kl": 5.826354026794434e-06, "learning_rate": 5.71e-07, "loss": 0.0, "num_tokens": 2440028.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 153.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.017458437010645866, "kl": 0.0009235720208380371, "learning_rate": 5.706666666666667e-07, "loss": 0.0, "num_tokens": 2440294.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 153.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09783709794282913, "kl": 0.016546230297535658, "learning_rate": 5.703333333333334e-07, "loss": 0.0008, "num_tokens": 2440621.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 153.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.016708340495824814, "kl": 0.00336581066949293, "learning_rate": 5.7e-07, "loss": 0.0002, "num_tokens": 2440913.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 153.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.09030290693044662, "kl": 0.018074131337925792, "learning_rate": 5.696666666666666e-07, "loss": 0.001, "num_tokens": 2441220.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 153.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.48011282086372375, "kl": 0.08293337374925613, "learning_rate": 5.693333333333333e-07, "loss": 0.0032, "num_tokens": 2441537.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 153.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.056277938187122345, "kl": 0.02923646569252014, "learning_rate": 5.690000000000001e-07, "loss": 0.0014, "num_tokens": 2441811.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 153.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03138343244791031, "kl": 0.001311879779677838, "learning_rate": 5.686666666666667e-07, "loss": 0.0001, "num_tokens": 2442045.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 153.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.328172206878662, "kl": 0.24690625071525574, "learning_rate": 5.683333333333333e-07, "loss": 0.1248, "num_tokens": 2442358.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 153.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00587368942797184, "kl": 0.000370153778931126, "learning_rate": 5.68e-07, "loss": 0.0, "num_tokens": 2442618.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 153.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.08754284679889679, "kl": 0.004288829397410154, "learning_rate": 5.676666666666667e-07, "loss": 0.0003, "num_tokens": 2442845.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 153.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04405666142702103, "kl": 0.002103633596561849, "learning_rate": 5.673333333333334e-07, "loss": 0.0001, "num_tokens": 2443143.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 153.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005244787898845971, "kl": 0.001316857582423836, "learning_rate": 5.67e-07, "loss": 0.0001, "num_tokens": 2443420.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 153.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.012456518597900867, "kl": 0.0008397191413678229, "learning_rate": 5.666666666666667e-07, "loss": 0.0, "num_tokens": 2443680.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 153.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.031383346766233444, "kl": 0.005854287534020841, "learning_rate": 5.663333333333333e-07, "loss": 0.0002, "num_tokens": 2443996.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 153.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751035213470459, "kl": 0.048893094062805176, "learning_rate": 5.660000000000001e-07, "loss": 0.0024, "num_tokens": 2444288.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 153.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01674734055995941, "kl": 0.0001826956868171692, "learning_rate": 5.656666666666667e-07, "loss": 0.0, "num_tokens": 2444498.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 153.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.026693180203437805, "kl": 0.0016957670450210571, "learning_rate": 5.653333333333333e-07, "loss": 0.0001, "num_tokens": 2444710.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 153.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.017887800931930542, "kl": 0.005099207162857056, "learning_rate": 5.65e-07, "loss": 0.0003, "num_tokens": 2444978.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.75, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 153.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0458885096013546, "kl": 0.01571763912215829, "learning_rate": 5.646666666666667e-07, "loss": 0.0006, "num_tokens": 2445365.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 153.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06181563809514046, "kl": 0.002845512703061104, "learning_rate": 5.643333333333333e-07, "loss": 0.0001, "num_tokens": 2445657.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 153.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03170020133256912, "kl": 0.00026510655879974365, "learning_rate": 5.64e-07, "loss": 0.0, "num_tokens": 2445869.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 153.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.031905390322208405, "kl": 0.002102102618664503, "learning_rate": 5.636666666666667e-07, "loss": 0.0001, "num_tokens": 2446192.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 153.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.09781618416309357, "kl": 0.03031076118350029, "learning_rate": 5.633333333333333e-07, "loss": 0.0014, "num_tokens": 2446538.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 153.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.018083926290273666, "kl": 0.0005412757454905659, "learning_rate": 5.63e-07, "loss": 0.0, "num_tokens": 2446854.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 153.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 1.9756832122802734, "kl": 0.061098862439394, "learning_rate": 5.626666666666667e-07, "loss": 0.0038, "num_tokens": 2447201.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 153.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.021097060292959213, "kl": 0.008475386537611485, "learning_rate": 5.623333333333333e-07, "loss": 0.0004, "num_tokens": 2447532.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 153.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.73818039894104, "kl": 0.04998205788433552, "learning_rate": 5.62e-07, "loss": 0.1493, "num_tokens": 2447849.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 154.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03913053125143051, "kl": 0.008828048594295979, "learning_rate": 5.616666666666668e-07, "loss": 0.0004, "num_tokens": 2448139.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 154.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.008888961747288704, "kl": 0.000478645961266011, "learning_rate": 5.613333333333333e-07, "loss": 0.0, "num_tokens": 2448458.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05503654107451439, "kl": 0.004644736181944609, "learning_rate": 5.61e-07, "loss": 0.0002, "num_tokens": 2448732.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 154.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.04165855422616005, "kl": 0.0015937138814479113, "learning_rate": 5.606666666666667e-07, "loss": 0.0001, "num_tokens": 2449006.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 154.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.006506198085844517, "kl": 0.00023494711422245018, "learning_rate": 5.603333333333334e-07, "loss": 0.0, "num_tokens": 2449320.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 154.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.000523120048455894, "kl": 1.858919858932495e-05, "learning_rate": 5.6e-07, "loss": 0.0, "num_tokens": 2449532.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 154.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004662947729229927, "kl": 0.00022652148618362844, "learning_rate": 5.596666666666667e-07, "loss": 0.0, "num_tokens": 2449752.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.030857738107442856, "kl": 0.001850803499110043, "learning_rate": 5.593333333333333e-07, "loss": 0.0001, "num_tokens": 2450050.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 154.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08106081187725067, "kl": 0.007502306718379259, "learning_rate": 5.59e-07, "loss": 0.0004, "num_tokens": 2450386.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 154.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.026218606159090996, "kl": 0.0015904158353805542, "learning_rate": 5.586666666666667e-07, "loss": 0.0001, "num_tokens": 2450598.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 154.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042798908543772995, "kl": 0.0012370496988296509, "learning_rate": 5.583333333333333e-07, "loss": 0.0001, "num_tokens": 2450878.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 154.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02619774080812931, "kl": 0.0007577687720186077, "learning_rate": 5.58e-07, "loss": 0.0, "num_tokens": 2451134.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 154.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.030264941975474358, "kl": 0.004409928224049509, "learning_rate": 5.576666666666667e-07, "loss": 0.0002, "num_tokens": 2451422.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 154.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.029210738837718964, "kl": 0.0733129046857357, "learning_rate": 5.573333333333333e-07, "loss": 0.0037, "num_tokens": 2451793.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 154.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.005117594730108976, "kl": 0.0003747120499610901, "learning_rate": 5.57e-07, "loss": 0.0, "num_tokens": 2452037.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 154.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03576855733990669, "kl": 0.00482076033949852, "learning_rate": 5.566666666666667e-07, "loss": 0.0002, "num_tokens": 2452364.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 154.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07462895661592484, "kl": 0.03889967314898968, "learning_rate": 5.563333333333334e-07, "loss": 0.002, "num_tokens": 2452745.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 154.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.018847031518816948, "kl": 0.26534825563430786, "learning_rate": 5.56e-07, "loss": 0.0133, "num_tokens": 2453049.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 154.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06223014369606972, "kl": 0.011949718929827213, "learning_rate": 5.556666666666667e-07, "loss": 0.0006, "num_tokens": 2453326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 154.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10442297905683517, "kl": 0.032682210206985474, "learning_rate": 5.553333333333334e-07, "loss": 0.0017, "num_tokens": 2453630.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 154.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 1.1021322011947632, "kl": 0.13737642765045166, "learning_rate": 5.55e-07, "loss": 0.0069, "num_tokens": 2453969.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 154.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.00566828390583396, "kl": 0.0003056225832551718, "learning_rate": 5.546666666666667e-07, "loss": 0.0, "num_tokens": 2454231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 154.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.006953234784305096, "kl": 0.16165469586849213, "learning_rate": 5.543333333333333e-07, "loss": 0.0081, "num_tokens": 2454540.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 154.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.634182929992676, "kl": 0.04869305342435837, "learning_rate": 5.54e-07, "loss": 0.147, "num_tokens": 2454883.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 154.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 3.85901141166687, "kl": 0.022627011872828007, "learning_rate": 5.536666666666667e-07, "loss": 0.1245, "num_tokens": 2455238.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 154.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.005726439878344536, "kl": 0.0005736123712267727, "learning_rate": 5.533333333333334e-07, "loss": 0.0, "num_tokens": 2455472.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01484580710530281, "kl": 0.0012317707878537476, "learning_rate": 5.529999999999999e-07, "loss": 0.0001, "num_tokens": 2455750.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 154.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03656516224145889, "kl": 0.012933549180161208, "learning_rate": 5.526666666666667e-07, "loss": 0.0007, "num_tokens": 2456037.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 154.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.8089420795440674, "kl": 0.044436972588300705, "learning_rate": 5.523333333333334e-07, "loss": 0.0991, "num_tokens": 2456375.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 8344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 154.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.845674991607666, "kl": 0.02177418302744627, "learning_rate": 5.52e-07, "loss": -0.0595, "num_tokens": 2456720.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 8345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 154.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.000318619393510744, "kl": 8.121132850646973e-06, "learning_rate": 5.516666666666666e-07, "loss": 0.0, "num_tokens": 2456940.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 154.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.005950066260993481, "kl": 0.0003937259316444397, "learning_rate": 5.513333333333334e-07, "loss": 0.0, "num_tokens": 2457200.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 154.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 3.636676073074341, "kl": 0.07898950390517712, "learning_rate": 5.51e-07, "loss": -0.0107, "num_tokens": 2457503.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 154.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.029922667890787125, "kl": 0.001224012579768896, "learning_rate": 5.506666666666667e-07, "loss": 0.0001, "num_tokens": 2457808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 154.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.007834872230887413, "kl": 0.0016319826245307922, "learning_rate": 5.503333333333334e-07, "loss": 0.0001, "num_tokens": 2458024.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 154.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.977325916290283, "kl": 0.013910597190260887, "learning_rate": 5.499999999999999e-07, "loss": -0.1529, "num_tokens": 2458292.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 8351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 154.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.13469654321670532, "kl": 0.021035901736468077, "learning_rate": 5.496666666666667e-07, "loss": 0.001, "num_tokens": 2458592.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 154.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04311174899339676, "kl": 0.006896013393998146, "learning_rate": 5.493333333333334e-07, "loss": 0.0003, "num_tokens": 2458870.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.020176585763692856, "kl": 0.00520420353859663, "learning_rate": 5.49e-07, "loss": 0.0003, "num_tokens": 2459138.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 154.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.060214847326278687, "kl": 0.005490035400725901, "learning_rate": 5.486666666666666e-07, "loss": 0.0003, "num_tokens": 2459396.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0613739900290966, "kl": 0.0031856546411290765, "learning_rate": 5.483333333333334e-07, "loss": 0.0002, "num_tokens": 2459666.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 154.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.011704935692250729, "kl": 0.007982588838785887, "learning_rate": 5.48e-07, "loss": 0.0004, "num_tokens": 2459938.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 154.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04813944175839424, "kl": 0.005820095539093018, "learning_rate": 5.476666666666667e-07, "loss": 0.0003, "num_tokens": 2460250.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 154.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10329499840736389, "kl": 0.05913754925131798, "learning_rate": 5.473333333333333e-07, "loss": 0.003, "num_tokens": 2460598.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 154.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004868348594754934, "kl": 0.00010152508912142366, "learning_rate": 5.47e-07, "loss": 0.0, "num_tokens": 2460868.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.017690882086753845, "kl": 0.0021029120252933353, "learning_rate": 5.466666666666667e-07, "loss": 0.0001, "num_tokens": 2461138.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 154.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04372517764568329, "kl": 0.0021331667667254806, "learning_rate": 5.463333333333334e-07, "loss": 0.0001, "num_tokens": 2461392.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 154.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.02498186193406582, "kl": 0.006004116032272577, "learning_rate": 5.46e-07, "loss": 0.0003, "num_tokens": 2461683.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 154.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04266771301627159, "kl": 0.008875719271600246, "learning_rate": 5.456666666666666e-07, "loss": 0.0004, "num_tokens": 2462013.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 154.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.024272803217172623, "kl": 0.04772520437836647, "learning_rate": 5.453333333333334e-07, "loss": 0.0024, "num_tokens": 2462417.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 154.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010209826286882162, "kl": 0.003725387156009674, "learning_rate": 5.450000000000001e-07, "loss": 0.0002, "num_tokens": 2462653.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 154.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.3208865225315094, "kl": 0.0930166020989418, "learning_rate": 5.446666666666666e-07, "loss": 0.0047, "num_tokens": 2463022.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 154.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07350413501262665, "kl": 0.03143562376499176, "learning_rate": 5.443333333333333e-07, "loss": 0.0016, "num_tokens": 2463356.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 154.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.007228239439427853, "kl": 0.0004351213574409485, "learning_rate": 5.44e-07, "loss": 0.0, "num_tokens": 2463564.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01315789483487606, "clip_ratio/low_min": 0.01315789483487606, "clip_ratio/region_mean": 0.01315789483487606, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 155.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.025207042694092, "kl": 0.02106407703831792, "learning_rate": 5.436666666666667e-07, "loss": -0.0038, "num_tokens": 2463856.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 155.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005544027779251337, "kl": 0.00030422210693359375, "learning_rate": 5.433333333333334e-07, "loss": 0.0, "num_tokens": 2464116.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 155.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09991784393787384, "kl": 0.041562771424651146, "learning_rate": 5.43e-07, "loss": 0.0021, "num_tokens": 2464411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 155.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 4.10620641708374, "kl": 0.11383737996220589, "learning_rate": 5.426666666666666e-07, "loss": 0.1118, "num_tokens": 2464709.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 155.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.12902311980724335, "kl": 0.052536096423864365, "learning_rate": 5.423333333333334e-07, "loss": 0.0027, "num_tokens": 2465094.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.00619744136929512, "kl": 0.0010310067445971072, "learning_rate": 5.420000000000001e-07, "loss": 0.0001, "num_tokens": 2465378.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.010383947752416134, "kl": 0.0004548997094389051, "learning_rate": 5.416666666666666e-07, "loss": 0.0, "num_tokens": 2465651.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8376 }, { "clip_ratio/high_max": 0.0058139534667134285, "clip_ratio/high_mean": 0.0058139534667134285, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0058139534667134285, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 155.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.265976905822754, "kl": 0.08938579261302948, "learning_rate": 5.413333333333333e-07, "loss": -0.0056, "num_tokens": 2466028.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 155.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08833947032690048, "kl": 0.010712239891290665, "learning_rate": 5.410000000000001e-07, "loss": 0.0005, "num_tokens": 2466272.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.019876381382346153, "kl": 0.005873143672943115, "learning_rate": 5.406666666666667e-07, "loss": 0.0003, "num_tokens": 2466554.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 155.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.058269817382097244, "kl": 0.00956787308678031, "learning_rate": 5.403333333333333e-07, "loss": 0.0005, "num_tokens": 2466878.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 155.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06682496517896652, "kl": 0.0031241700053215027, "learning_rate": 5.4e-07, "loss": 0.0002, "num_tokens": 2467094.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 155.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.11336594820022583, "kl": 0.038844432681798935, "learning_rate": 5.396666666666666e-07, "loss": 0.002, "num_tokens": 2467409.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 155.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.022847386077046394, "kl": 0.0005056319059804082, "learning_rate": 5.393333333333334e-07, "loss": 0.0, "num_tokens": 2467642.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.046104151755571365, "kl": 0.02171806525439024, "learning_rate": 5.39e-07, "loss": 0.0011, "num_tokens": 2467915.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 155.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.006710353307425976, "kl": 0.00022089167396188714, "learning_rate": 5.386666666666666e-07, "loss": 0.0, "num_tokens": 2468229.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.409780979156494, "kl": 0.021229079458862543, "learning_rate": 5.383333333333333e-07, "loss": -0.0524, "num_tokens": 2468518.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 155.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04751231148838997, "kl": 0.0008119717240333557, "learning_rate": 5.380000000000001e-07, "loss": 0.0, "num_tokens": 2468724.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 155.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048167286440730095, "kl": 0.00024247169494628906, "learning_rate": 5.376666666666667e-07, "loss": 0.0, "num_tokens": 2468944.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.011471696197986603, "kl": 0.007820246275514364, "learning_rate": 5.373333333333333e-07, "loss": 0.0004, "num_tokens": 2469216.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 155.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.031831782311201096, "kl": 0.012028172612190247, "learning_rate": 5.37e-07, "loss": 0.0007, "num_tokens": 2469490.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 155.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.00025179359363391995, "kl": 5.7891011238098145e-06, "learning_rate": 5.366666666666667e-07, "loss": 0.0, "num_tokens": 2469710.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 155.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.06769102066755295, "kl": 0.0086826549959369, "learning_rate": 5.363333333333334e-07, "loss": 0.0003, "num_tokens": 2470028.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 155.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 1.087205171585083, "kl": 0.22589676827192307, "learning_rate": 5.36e-07, "loss": 0.0115, "num_tokens": 2470402.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.01321250107139349, "kl": 0.011940506286919117, "learning_rate": 5.356666666666667e-07, "loss": 0.0006, "num_tokens": 2470682.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 155.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07551845908164978, "kl": 0.05878164991736412, "learning_rate": 5.353333333333333e-07, "loss": 0.0029, "num_tokens": 2471058.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 155.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.48009926080703735, "kl": 0.09998535551130772, "learning_rate": 5.350000000000001e-07, "loss": 0.0052, "num_tokens": 2471398.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 155.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011485875584185123, "kl": 0.0004405789077281952, "learning_rate": 5.346666666666667e-07, "loss": 0.0, "num_tokens": 2471658.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 155.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.15054261684417725, "kl": 0.018153753131628036, "learning_rate": 5.343333333333333e-07, "loss": 0.001, "num_tokens": 2471960.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 155.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.1454274654388428, "kl": 0.11350664414931089, "learning_rate": 5.34e-07, "loss": -0.0078, "num_tokens": 2472278.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 8399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 155.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.019624611362814903, "kl": 0.2651744931936264, "learning_rate": 5.336666666666667e-07, "loss": 0.0133, "num_tokens": 2472582.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 155.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.043150998651981354, "kl": 0.042211174964904785, "learning_rate": 5.333333333333333e-07, "loss": 0.0021, "num_tokens": 2472986.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 155.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 6.781017303466797, "kl": 0.08708360604941845, "learning_rate": 5.33e-07, "loss": 0.0482, "num_tokens": 2473306.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 155.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.014792831614613533, "kl": 0.003580319113098085, "learning_rate": 5.326666666666667e-07, "loss": 0.0002, "num_tokens": 2473574.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 155.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.1653987467288971, "kl": 0.011425634380429983, "learning_rate": 5.323333333333333e-07, "loss": 0.0006, "num_tokens": 2473899.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 155.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011270777322351933, "kl": 0.0037083476781845093, "learning_rate": 5.32e-07, "loss": 0.0002, "num_tokens": 2474135.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 155.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.14027844369411469, "kl": 0.027131887152791023, "learning_rate": 5.316666666666667e-07, "loss": 0.0013, "num_tokens": 2474440.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 155.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.275292158126831, "kl": 0.08845902606844902, "learning_rate": 5.313333333333333e-07, "loss": 0.035, "num_tokens": 2474818.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 155.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016922705108299851, "kl": 6.44649080641102e-05, "learning_rate": 5.31e-07, "loss": 0.0, "num_tokens": 2475090.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 155.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03561645373702049, "kl": 0.0010564652038738132, "learning_rate": 5.306666666666668e-07, "loss": 0.0001, "num_tokens": 2475347.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 155.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 6.05357551574707, "kl": 0.01311790058389306, "learning_rate": 5.303333333333333e-07, "loss": 0.069, "num_tokens": 2475610.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 155.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0975395143032074, "kl": 0.012654355959966779, "learning_rate": 5.3e-07, "loss": 0.0006, "num_tokens": 2475884.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 155.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.3723464012146, "kl": 0.013765290612354875, "learning_rate": 5.296666666666667e-07, "loss": -0.0336, "num_tokens": 2476164.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 8412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 155.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03530529513955116, "kl": 0.00457672169432044, "learning_rate": 5.293333333333333e-07, "loss": 0.0002, "num_tokens": 2476424.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 155.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08188877254724503, "kl": 0.008390740025788546, "learning_rate": 5.29e-07, "loss": 0.0004, "num_tokens": 2476726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 155.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03391231968998909, "kl": 0.0029532848857343197, "learning_rate": 5.286666666666667e-07, "loss": 0.0001, "num_tokens": 2476986.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 155.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.4728827476501465, "kl": 0.7759107742458582, "learning_rate": 5.283333333333333e-07, "loss": -0.0477, "num_tokens": 2477273.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 155.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.007028269115835428, "kl": 0.16169343888759613, "learning_rate": 5.28e-07, "loss": 0.0081, "num_tokens": 2477582.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 155.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.002486934419721365, "kl": 1.662224531173706e-05, "learning_rate": 5.276666666666667e-07, "loss": 0.0, "num_tokens": 2477794.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 155.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006349485483951867, "kl": 0.0013061835779808462, "learning_rate": 5.273333333333333e-07, "loss": 0.0001, "num_tokens": 2478071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 155.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02279621548950672, "kl": 0.04854956269264221, "learning_rate": 5.27e-07, "loss": 0.0024, "num_tokens": 2478413.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 155.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.05165979266166687, "kl": 0.015593254938721657, "learning_rate": 5.266666666666667e-07, "loss": 0.0008, "num_tokens": 2478760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 155.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.023921294137835503, "kl": 0.0012628336844500154, "learning_rate": 5.263333333333333e-07, "loss": 0.0001, "num_tokens": 2478976.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 155.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.038757625967264175, "kl": 0.0057800025679171085, "learning_rate": 5.26e-07, "loss": 0.0003, "num_tokens": 2479314.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 156.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04575493559241295, "kl": 0.005509747192263603, "learning_rate": 5.256666666666667e-07, "loss": 0.0003, "num_tokens": 2479626.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 156.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.004278066102415323, "kl": 0.001171433919807896, "learning_rate": 5.253333333333334e-07, "loss": 0.0001, "num_tokens": 2479886.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 156.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06770186126232147, "kl": 0.0031263157725334167, "learning_rate": 5.25e-07, "loss": 0.0002, "num_tokens": 2480102.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 156.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.317131996154785, "kl": 0.2115035280585289, "learning_rate": 5.246666666666667e-07, "loss": 0.1058, "num_tokens": 2480482.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 156.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.500802993774414, "kl": 0.5088626421420486, "learning_rate": 5.243333333333334e-07, "loss": 0.0665, "num_tokens": 2480768.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 156.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.015126118436455727, "kl": 0.00393518028431572, "learning_rate": 5.24e-07, "loss": 0.0002, "num_tokens": 2481026.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 156.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.048743266612291336, "kl": 0.004803207004442811, "learning_rate": 5.236666666666667e-07, "loss": 0.0002, "num_tokens": 2481328.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 156.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.01772715523838997, "kl": 0.005148844327777624, "learning_rate": 5.233333333333333e-07, "loss": 0.0003, "num_tokens": 2481596.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 156.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10989715158939362, "kl": 0.011333472561091185, "learning_rate": 5.23e-07, "loss": 0.0006, "num_tokens": 2481883.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 156.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.018481656908988953, "kl": 0.00038522534305229783, "learning_rate": 5.226666666666667e-07, "loss": 0.0, "num_tokens": 2482126.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 156.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08538296818733215, "kl": 0.0022356927511282265, "learning_rate": 5.223333333333334e-07, "loss": 0.0001, "num_tokens": 2482339.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 156.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.616231918334961, "kl": 0.26572537142783403, "learning_rate": 5.219999999999999e-07, "loss": 0.018, "num_tokens": 2482615.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 156.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.017566002905368805, "kl": 0.002983683720231056, "learning_rate": 5.216666666666667e-07, "loss": 0.0002, "num_tokens": 2482949.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 156.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.053898610174655914, "kl": 0.004574377555400133, "learning_rate": 5.213333333333334e-07, "loss": 0.0002, "num_tokens": 2483247.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 156.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 6.173861026763916, "kl": 0.09855138882994652, "learning_rate": 5.21e-07, "loss": 0.313, "num_tokens": 2483547.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 156.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.024702956900000572, "kl": 0.04162203148007393, "learning_rate": 5.206666666666666e-07, "loss": 0.0021, "num_tokens": 2483952.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 156.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.5571836829185486, "kl": 0.07365526258945465, "learning_rate": 5.203333333333334e-07, "loss": 0.0045, "num_tokens": 2484227.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 156.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.00032187625765800476, "kl": 8.128583431243896e-06, "learning_rate": 5.2e-07, "loss": 0.0, "num_tokens": 2484447.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 156.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02686251327395439, "kl": 0.005212683929130435, "learning_rate": 5.196666666666667e-07, "loss": 0.0003, "num_tokens": 2484735.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 156.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02043980360031128, "kl": 0.011767649091780186, "learning_rate": 5.193333333333334e-07, "loss": 0.0006, "num_tokens": 2484995.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 156.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.11391869187355042, "kl": 0.043839987367391586, "learning_rate": 5.189999999999999e-07, "loss": 0.0022, "num_tokens": 2485293.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 156.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.021185003221035004, "kl": 0.2648857831954956, "learning_rate": 5.186666666666667e-07, "loss": 0.0132, "num_tokens": 2485597.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 156.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017053665360435843, "kl": 4.1743118345038965e-05, "learning_rate": 5.183333333333334e-07, "loss": 0.0, "num_tokens": 2485869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 156.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03420504927635193, "kl": 0.0038915553595870733, "learning_rate": 5.18e-07, "loss": 0.0002, "num_tokens": 2486139.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 156.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.002330042188987136, "kl": 0.00031235069036483765, "learning_rate": 5.176666666666666e-07, "loss": 0.0, "num_tokens": 2486399.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 156.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04137839376926422, "kl": 0.007751275785267353, "learning_rate": 5.173333333333334e-07, "loss": 0.0004, "num_tokens": 2486694.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 156.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.022049132734537125, "kl": 0.0014656584244221449, "learning_rate": 5.17e-07, "loss": 0.0001, "num_tokens": 2486972.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 156.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.026242000982165337, "kl": 0.0016636699438095093, "learning_rate": 5.166666666666667e-07, "loss": 0.0001, "num_tokens": 2487184.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 156.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.028206482529640198, "kl": 0.055699342861771584, "learning_rate": 5.163333333333333e-07, "loss": 0.0028, "num_tokens": 2487520.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 156.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.015877556055784225, "kl": 0.00019871890981448814, "learning_rate": 5.16e-07, "loss": 0.0, "num_tokens": 2487776.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 156.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.023555463179945946, "kl": 0.07299611158668995, "learning_rate": 5.156666666666667e-07, "loss": 0.0037, "num_tokens": 2488147.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 156.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.009435631334781647, "kl": 0.15927845239639282, "learning_rate": 5.153333333333334e-07, "loss": 0.008, "num_tokens": 2488457.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 156.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.05060119181871414, "kl": 0.0022037744492990896, "learning_rate": 5.15e-07, "loss": 0.0001, "num_tokens": 2488676.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 156.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.15501393377780914, "kl": 0.016626416007056832, "learning_rate": 5.146666666666666e-07, "loss": 0.0009, "num_tokens": 2489008.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 156.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04478546977043152, "kl": 0.00516450684517622, "learning_rate": 5.143333333333334e-07, "loss": 0.0003, "num_tokens": 2489320.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 156.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.24517595767974854, "kl": 0.04788942728191614, "learning_rate": 5.140000000000001e-07, "loss": 0.0024, "num_tokens": 2489657.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 156.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.2047641277313232, "kl": 0.0207534022629261, "learning_rate": 5.136666666666666e-07, "loss": 0.0012, "num_tokens": 2490003.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 156.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.013539710082113743, "kl": 0.0007548865396529436, "learning_rate": 5.133333333333333e-07, "loss": 0.0, "num_tokens": 2490270.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 156.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.6001968383789062, "kl": 0.07414672523736954, "learning_rate": 5.13e-07, "loss": -0.0103, "num_tokens": 2490635.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 156.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01546122133731842, "kl": 0.006629306124523282, "learning_rate": 5.126666666666667e-07, "loss": 0.0003, "num_tokens": 2490927.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 156.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06342559307813644, "kl": 0.03193545900285244, "learning_rate": 5.123333333333334e-07, "loss": 0.0017, "num_tokens": 2491312.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 156.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.001116510247811675, "kl": 0.0037094801664352417, "learning_rate": 5.12e-07, "loss": 0.0002, "num_tokens": 2491548.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 156.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0412026047706604, "kl": 0.0005738437175750732, "learning_rate": 5.116666666666666e-07, "loss": 0.0, "num_tokens": 2491758.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 156.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005234990967437625, "kl": 0.00123817368876189, "learning_rate": 5.113333333333334e-07, "loss": 0.0001, "num_tokens": 2492038.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 156.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0093684708699584, "kl": 0.00044766482460545376, "learning_rate": 5.110000000000001e-07, "loss": 0.0, "num_tokens": 2492352.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 156.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.07600437104701996, "kl": 0.008421032456681132, "learning_rate": 5.106666666666666e-07, "loss": 0.0004, "num_tokens": 2492691.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 156.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05454156920313835, "kl": 0.002588084666058421, "learning_rate": 5.103333333333333e-07, "loss": 0.0001, "num_tokens": 2492963.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 156.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.008401497267186642, "kl": 0.0005766671383753419, "learning_rate": 5.100000000000001e-07, "loss": 0.0, "num_tokens": 2493198.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 156.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.23921537399291992, "kl": 0.02082074456848204, "learning_rate": 5.096666666666667e-07, "loss": 0.001, "num_tokens": 2493490.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 156.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03600386902689934, "kl": 0.013860939536243677, "learning_rate": 5.093333333333333e-07, "loss": 0.0005, "num_tokens": 2493882.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 156.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04453738406300545, "kl": 0.011568172369152308, "learning_rate": 5.09e-07, "loss": 0.0006, "num_tokens": 2494204.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 156.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.15386320650577545, "kl": 0.014382415916770697, "learning_rate": 5.086666666666666e-07, "loss": 0.0008, "num_tokens": 2494532.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 156.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0813414454460144, "kl": 0.018582265824079514, "learning_rate": 5.083333333333334e-07, "loss": 0.0009, "num_tokens": 2494834.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 156.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.37837982177734375, "kl": 0.033761862374376506, "learning_rate": 5.08e-07, "loss": 0.0019, "num_tokens": 2495118.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 157.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.14353086054325104, "kl": 0.020357735455036163, "learning_rate": 5.076666666666666e-07, "loss": 0.0011, "num_tokens": 2495392.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 157.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02693096548318863, "kl": 0.0017829835414886475, "learning_rate": 5.073333333333333e-07, "loss": 0.0001, "num_tokens": 2495604.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 157.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.000296739861369133, "kl": 6.8694353103637695e-06, "learning_rate": 5.070000000000001e-07, "loss": 0.0, "num_tokens": 2495824.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 157.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.1642432063817978, "kl": 0.02829754166305065, "learning_rate": 5.066666666666667e-07, "loss": 0.0014, "num_tokens": 2496126.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 157.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.004903525114059448, "kl": 7.014721632003784e-05, "learning_rate": 5.063333333333333e-07, "loss": 0.0, "num_tokens": 2496338.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 157.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0500800721347332, "kl": 0.002676622476428747, "learning_rate": 5.06e-07, "loss": 0.0001, "num_tokens": 2496592.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 157.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.028845123946666718, "kl": 0.0053834563586860895, "learning_rate": 5.056666666666667e-07, "loss": 0.0002, "num_tokens": 2496858.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 157.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03952033817768097, "kl": 0.01759189274162054, "learning_rate": 5.053333333333334e-07, "loss": 0.0009, "num_tokens": 2497157.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 157.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04855850338935852, "kl": 0.006311023724265397, "learning_rate": 5.05e-07, "loss": 0.0003, "num_tokens": 2497450.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 157.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.1092715710401535, "kl": 0.042482415214180946, "learning_rate": 5.046666666666667e-07, "loss": 0.0021, "num_tokens": 2497824.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 157.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.15142570436000824, "kl": 0.02115795575082302, "learning_rate": 5.043333333333333e-07, "loss": 0.0012, "num_tokens": 2498109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 157.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 14.66716480255127, "kl": 0.027197793126106262, "learning_rate": 5.040000000000001e-07, "loss": 0.1654, "num_tokens": 2498323.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 157.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05180661752820015, "kl": 0.011524613946676254, "learning_rate": 5.036666666666667e-07, "loss": 0.0005, "num_tokens": 2498651.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 157.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.023557983338832855, "kl": 0.0008277259767055511, "learning_rate": 5.033333333333333e-07, "loss": 0.0, "num_tokens": 2498894.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 157.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.016277238726615906, "kl": 0.0005233370466157794, "learning_rate": 5.03e-07, "loss": 0.0, "num_tokens": 2499207.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 157.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.14778617024421692, "kl": 0.008364896522834897, "learning_rate": 5.026666666666667e-07, "loss": 0.0004, "num_tokens": 2499464.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 157.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.017961518839001656, "kl": 0.0011804367823060602, "learning_rate": 5.023333333333333e-07, "loss": 0.0001, "num_tokens": 2499740.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 157.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.003330766223371029, "kl": 0.0003237202763557434, "learning_rate": 5.02e-07, "loss": 0.0, "num_tokens": 2500000.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 157.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.1286805421113968, "kl": 0.03825647942721844, "learning_rate": 5.016666666666667e-07, "loss": 0.002, "num_tokens": 2500269.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 157.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.6071343421936035, "kl": 0.07391935959458351, "learning_rate": 5.013333333333333e-07, "loss": 0.1767, "num_tokens": 2500615.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 157.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.00044517318019643426, "kl": 0.0012253480963408947, "learning_rate": 5.01e-07, "loss": 0.0001, "num_tokens": 2500895.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 157.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04226859286427498, "kl": 0.00385933555662632, "learning_rate": 5.006666666666667e-07, "loss": 0.0002, "num_tokens": 2501207.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 157.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 2.41400408744812, "kl": 0.132224190980196, "learning_rate": 5.003333333333333e-07, "loss": 0.0363, "num_tokens": 2501593.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 8500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 157.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.029483066871762276, "kl": 0.002384966181125492, "learning_rate": 5e-07, "loss": 0.0001, "num_tokens": 2501924.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 157.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 1.1090335845947266, "kl": 0.1020755278877914, "learning_rate": 4.996666666666668e-07, "loss": 0.0041, "num_tokens": 2502212.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 157.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.9743456840515137, "kl": 0.10830906359478831, "learning_rate": 4.993333333333333e-07, "loss": -0.0044, "num_tokens": 2502554.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 157.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.014132065698504448, "kl": 0.07349956035614014, "learning_rate": 4.99e-07, "loss": 0.0037, "num_tokens": 2502924.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 157.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.020838946104049683, "kl": 0.2649073898792267, "learning_rate": 4.986666666666667e-07, "loss": 0.0132, "num_tokens": 2503228.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 157.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07360532134771347, "kl": 0.038996681571006775, "learning_rate": 4.983333333333333e-07, "loss": 0.0019, "num_tokens": 2503540.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 157.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01846860721707344, "kl": 0.002976749907247722, "learning_rate": 4.98e-07, "loss": 0.0002, "num_tokens": 2503872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 157.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02639959193766117, "kl": 0.0014309905236586928, "learning_rate": 4.976666666666667e-07, "loss": 0.0001, "num_tokens": 2504176.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 157.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.622972011566162, "kl": 0.09838047996163368, "learning_rate": 4.973333333333333e-07, "loss": 0.1487, "num_tokens": 2504535.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 157.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02174372412264347, "kl": 0.0007273862429428846, "learning_rate": 4.97e-07, "loss": 0.0, "num_tokens": 2504797.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 157.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 9.416647911071777, "kl": 2.1433481190761086, "learning_rate": 4.966666666666667e-07, "loss": 0.1829, "num_tokens": 2505035.0, "reward": 2.5, "reward_std": 3.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.0, "step": 8511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 157.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.08385979384183884, "kl": 0.014134705998003483, "learning_rate": 4.963333333333333e-07, "loss": 0.0007, "num_tokens": 2505361.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 157.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.020961308851838112, "kl": 0.005719877779483795, "learning_rate": 4.96e-07, "loss": 0.0003, "num_tokens": 2505629.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 157.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.07840628921985626, "kl": 0.011885132640600204, "learning_rate": 4.956666666666667e-07, "loss": 0.0006, "num_tokens": 2505935.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 157.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.008504621684551239, "kl": 0.0007483886438421905, "learning_rate": 4.953333333333333e-07, "loss": 0.0, "num_tokens": 2506215.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 157.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.12445389479398727, "kl": 0.023703727638348937, "learning_rate": 4.95e-07, "loss": 0.0012, "num_tokens": 2506597.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 157.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.011719525791704655, "kl": 0.007828783709555864, "learning_rate": 4.946666666666667e-07, "loss": 0.0004, "num_tokens": 2506869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 157.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.7693305015563965, "kl": 0.0615805983543396, "learning_rate": 4.943333333333334e-07, "loss": -0.0955, "num_tokens": 2507225.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 157.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.02838600054383278, "kl": 0.012475146446377039, "learning_rate": 4.94e-07, "loss": 0.0007, "num_tokens": 2507497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 157.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.10307072103023529, "kl": 0.019812828861176968, "learning_rate": 4.936666666666667e-07, "loss": 0.001, "num_tokens": 2507787.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 157.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03024272806942463, "kl": 0.03816715348511934, "learning_rate": 4.933333333333334e-07, "loss": 0.0019, "num_tokens": 2508192.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 157.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.021345676854252815, "kl": 0.0006064012777642347, "learning_rate": 4.93e-07, "loss": 0.0, "num_tokens": 2508411.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 157.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.12507128715515137, "kl": 0.17312154173851013, "learning_rate": 4.926666666666667e-07, "loss": 0.0086, "num_tokens": 2508721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 157.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011349762789905071, "kl": 0.0036967843770980835, "learning_rate": 4.923333333333333e-07, "loss": 0.0002, "num_tokens": 2508957.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 157.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.16203093528747559, "kl": 0.017759868002031, "learning_rate": 4.92e-07, "loss": 0.0009, "num_tokens": 2509285.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 157.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03434457257390022, "kl": 0.010937778744846582, "learning_rate": 4.916666666666667e-07, "loss": 0.0005, "num_tokens": 2509546.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 157.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.006401673890650272, "kl": 0.0007060051138978451, "learning_rate": 4.913333333333334e-07, "loss": 0.0, "num_tokens": 2509806.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 157.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 6.46056604385376, "kl": 1.0507410652935505, "learning_rate": 4.909999999999999e-07, "loss": 0.0751, "num_tokens": 2510115.0, "reward": 2.5, "reward_std": 3.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.0, "step": 8528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 157.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02702411264181137, "kl": 0.0009557564044371247, "learning_rate": 4.906666666666667e-07, "loss": 0.0, "num_tokens": 2510411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 157.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02450629696249962, "kl": 0.0019890672992914915, "learning_rate": 4.903333333333334e-07, "loss": 0.0001, "num_tokens": 2510684.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 157.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0746566578745842, "kl": 0.015292820055037737, "learning_rate": 4.9e-07, "loss": 0.0008, "num_tokens": 2510967.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06476012617349625, "kl": 0.0033435896039009094, "learning_rate": 4.896666666666666e-07, "loss": 0.0002, "num_tokens": 2511183.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 158.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.4616315364837646, "kl": 0.100852370262146, "learning_rate": 4.893333333333334e-07, "loss": 0.0126, "num_tokens": 2511562.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 158.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0408039428293705, "kl": 0.0022240668768063188, "learning_rate": 4.89e-07, "loss": 0.0001, "num_tokens": 2511834.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 158.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 3.360039710998535, "kl": 0.0472066942602396, "learning_rate": 4.886666666666667e-07, "loss": -0.1574, "num_tokens": 2512183.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 158.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031456961296498775, "kl": 0.0016734758391976357, "learning_rate": 4.883333333333334e-07, "loss": 0.0001, "num_tokens": 2512495.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 158.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02307688631117344, "kl": 0.001133069396018982, "learning_rate": 4.879999999999999e-07, "loss": 0.0, "num_tokens": 2512711.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.024726692587137222, "kl": 0.00523054925724864, "learning_rate": 4.876666666666667e-07, "loss": 0.0003, "num_tokens": 2512989.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 158.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.00701464107260108, "kl": 0.0006848298653494567, "learning_rate": 4.873333333333334e-07, "loss": 0.0, "num_tokens": 2513273.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 158.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08706600219011307, "kl": 0.03382064402103424, "learning_rate": 4.87e-07, "loss": 0.0017, "num_tokens": 2513570.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.07695037126541138, "kl": 0.0071961539797484875, "learning_rate": 4.866666666666666e-07, "loss": 0.0004, "num_tokens": 2513789.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 7.12135442881845e-05, "kl": 2.332031726837158e-06, "learning_rate": 4.863333333333334e-07, "loss": 0.0, "num_tokens": 2514009.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 158.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08905328065156937, "kl": 0.019308204296976328, "learning_rate": 4.86e-07, "loss": 0.001, "num_tokens": 2514295.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 4.437072277069092, "kl": 0.42022114992141724, "learning_rate": 4.856666666666667e-07, "loss": 0.028, "num_tokens": 2514535.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 158.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.01061983872205019, "kl": 0.00872632977552712, "learning_rate": 4.853333333333333e-07, "loss": 0.0004, "num_tokens": 2514809.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 158.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 0.9914193153381348, "kl": 0.006183861289173365, "learning_rate": 4.85e-07, "loss": -0.0005, "num_tokens": 2515097.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 158.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.13980522751808167, "kl": 0.009371042484417558, "learning_rate": 4.846666666666667e-07, "loss": 0.0005, "num_tokens": 2515362.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 158.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005353864398784935, "kl": 0.0012410475173965096, "learning_rate": 4.843333333333334e-07, "loss": 0.0001, "num_tokens": 2515642.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 158.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.028003400191664696, "kl": 0.0005597322960966267, "learning_rate": 4.84e-07, "loss": 0.0, "num_tokens": 2515898.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01123598963022232, "kl": 0.0010447597014717758, "learning_rate": 4.836666666666666e-07, "loss": 0.0001, "num_tokens": 2516166.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 158.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.022090567275881767, "kl": 0.2647307515144348, "learning_rate": 4.833333333333334e-07, "loss": 0.0132, "num_tokens": 2516470.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 158.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.029005344957113266, "kl": 0.03795054741203785, "learning_rate": 4.830000000000001e-07, "loss": 0.0019, "num_tokens": 2516875.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 158.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02571025677025318, "kl": 0.0015140259929466993, "learning_rate": 4.826666666666666e-07, "loss": 0.0001, "num_tokens": 2517197.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.036035045981407166, "kl": 0.0022087815450504422, "learning_rate": 4.823333333333333e-07, "loss": 0.0001, "num_tokens": 2517473.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 158.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.024019135162234306, "kl": 0.002115154347848147, "learning_rate": 4.82e-07, "loss": 0.0001, "num_tokens": 2517797.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 158.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.004839766770601273, "kl": 0.0002545595198171213, "learning_rate": 4.816666666666667e-07, "loss": 0.0, "num_tokens": 2518017.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 158.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.023963728919625282, "kl": 0.0013586296408902854, "learning_rate": 4.813333333333334e-07, "loss": 0.0001, "num_tokens": 2518252.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 158.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.8045413494110107, "kl": 0.10301024094223976, "learning_rate": 4.81e-07, "loss": 0.0596, "num_tokens": 2518598.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 158.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.20957903563976288, "kl": 0.03722521383315325, "learning_rate": 4.806666666666666e-07, "loss": 0.0026, "num_tokens": 2518876.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 158.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.066530704498291, "kl": 0.047671230509877205, "learning_rate": 4.803333333333334e-07, "loss": 0.0623, "num_tokens": 2519193.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 158.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0672617256641388, "kl": 0.00913128606043756, "learning_rate": 4.800000000000001e-07, "loss": 0.0005, "num_tokens": 2519499.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 158.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.16807205975055695, "kl": 0.02666935371235013, "learning_rate": 4.796666666666666e-07, "loss": 0.0014, "num_tokens": 2519792.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.044456418603658676, "kl": 0.004684043116867542, "learning_rate": 4.793333333333333e-07, "loss": 0.0002, "num_tokens": 2520060.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 158.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.09888976812362671, "kl": 0.01866341568529606, "learning_rate": 4.790000000000001e-07, "loss": 0.0009, "num_tokens": 2520389.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 158.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02013307251036167, "kl": 0.011850446462631226, "learning_rate": 4.786666666666667e-07, "loss": 0.0006, "num_tokens": 2520649.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 158.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.201779842376709, "kl": 0.22302223520819098, "learning_rate": 4.783333333333333e-07, "loss": -0.0886, "num_tokens": 2520911.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.001245591207407415, "kl": 2.7805566787719727e-05, "learning_rate": 4.78e-07, "loss": 0.0, "num_tokens": 2521123.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.010588799603283405, "kl": 0.0011360293137840927, "learning_rate": 4.776666666666666e-07, "loss": 0.0001, "num_tokens": 2521419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 158.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.038274750113487244, "kl": 0.0065190650057047606, "learning_rate": 4.773333333333334e-07, "loss": 0.0003, "num_tokens": 2521713.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 158.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.7623893022537231, "kl": 0.08423770777881145, "learning_rate": 4.77e-07, "loss": 0.0671, "num_tokens": 2522081.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 158.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.8867781162261963, "kl": 0.11864447966217995, "learning_rate": 4.7666666666666667e-07, "loss": 0.0059, "num_tokens": 2522381.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 158.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.3298551142215729, "kl": 0.014585547847673297, "learning_rate": 4.763333333333333e-07, "loss": 0.0007, "num_tokens": 2522710.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 158.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.06043326482176781, "kl": 0.012919301632791758, "learning_rate": 4.7600000000000003e-07, "loss": 0.0005, "num_tokens": 2523024.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 158.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.008054965175688267, "kl": 0.0014065116411074996, "learning_rate": 4.756666666666667e-07, "loss": 0.0001, "num_tokens": 2523284.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 158.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.022008635103702545, "kl": 0.000444166362285614, "learning_rate": 4.7533333333333333e-07, "loss": 0.0, "num_tokens": 2523490.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 158.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 0.5160182118415833, "kl": 0.42667537182569504, "learning_rate": 4.75e-07, "loss": 0.0007, "num_tokens": 2523860.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 8576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 158.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.10601342469453812, "kl": 0.030844644643366337, "learning_rate": 4.7466666666666663e-07, "loss": 0.0017, "num_tokens": 2524177.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06133238226175308, "kl": 0.0183323142118752, "learning_rate": 4.7433333333333336e-07, "loss": 0.0009, "num_tokens": 2524446.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 158.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.00484099518507719, "kl": 0.00039897486567497253, "learning_rate": 4.7400000000000004e-07, "loss": 0.0, "num_tokens": 2524706.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 158.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.060721710324287415, "kl": 0.0049377307295799255, "learning_rate": 4.7366666666666666e-07, "loss": 0.0003, "num_tokens": 2524961.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 158.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 4.252732276916504, "kl": 0.09712276276695775, "learning_rate": 4.7333333333333334e-07, "loss": 0.1094, "num_tokens": 2525282.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 8581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 158.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.158277750015259, "kl": 0.05956242233514786, "learning_rate": 4.7300000000000007e-07, "loss": -0.0109, "num_tokens": 2525648.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 158.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03432878479361534, "kl": 0.007546215783804655, "learning_rate": 4.726666666666667e-07, "loss": 0.0004, "num_tokens": 2525938.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 158.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.027142107486724854, "kl": 0.007473616395145655, "learning_rate": 4.723333333333333e-07, "loss": 0.0004, "num_tokens": 2526230.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 158.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008273106999695301, "kl": 0.00010346919589210302, "learning_rate": 4.72e-07, "loss": 0.0, "num_tokens": 2526498.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 159.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.6882164478302, "kl": 0.15949007868766785, "learning_rate": 4.716666666666667e-07, "loss": 0.2016, "num_tokens": 2526828.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 159.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005431402008980513, "kl": 0.00021148494124645367, "learning_rate": 4.7133333333333335e-07, "loss": 0.0, "num_tokens": 2527050.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.13048556447029114, "kl": 0.012707748916000128, "learning_rate": 4.71e-07, "loss": 0.0006, "num_tokens": 2527374.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02913927286863327, "kl": 0.006163935177028179, "learning_rate": 4.7066666666666665e-07, "loss": 0.0003, "num_tokens": 2527664.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 159.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.3855316936969757, "kl": 0.06308460980653763, "learning_rate": 4.703333333333333e-07, "loss": 0.0032, "num_tokens": 2527998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 159.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027163501363247633, "kl": 0.00025666777219157666, "learning_rate": 4.7000000000000005e-07, "loss": 0.0, "num_tokens": 2528260.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 159.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.005124661140143871, "kl": 0.0015183575451374054, "learning_rate": 4.696666666666667e-07, "loss": 0.0001, "num_tokens": 2528572.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 159.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.007446943782269955, "kl": 0.0014121129643172026, "learning_rate": 4.6933333333333335e-07, "loss": 0.0001, "num_tokens": 2528849.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 159.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.056365855038166046, "kl": 0.04130646586418152, "learning_rate": 4.69e-07, "loss": 0.0021, "num_tokens": 2529147.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.036859773099422455, "kl": 0.004151446162723005, "learning_rate": 4.686666666666667e-07, "loss": 0.0002, "num_tokens": 2529438.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 159.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06409481167793274, "kl": 0.010138588957488537, "learning_rate": 4.683333333333334e-07, "loss": 0.0005, "num_tokens": 2529754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 159.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.024869225919246674, "kl": 0.00039105117321014404, "learning_rate": 4.68e-07, "loss": 0.0, "num_tokens": 2529967.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 159.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.022369252517819405, "kl": 0.011333707720041275, "learning_rate": 4.676666666666667e-07, "loss": 0.0006, "num_tokens": 2530227.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 159.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.020912325009703636, "kl": 0.005029738647863269, "learning_rate": 4.673333333333333e-07, "loss": 0.0003, "num_tokens": 2530564.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 159.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.023824648931622505, "kl": 0.1605878844857216, "learning_rate": 4.6700000000000004e-07, "loss": 0.008, "num_tokens": 2530874.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 159.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.023516716435551643, "kl": 0.004270400386303663, "learning_rate": 4.6666666666666666e-07, "loss": 0.0002, "num_tokens": 2531142.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 159.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.035752128809690475, "kl": 0.026269923895597458, "learning_rate": 4.6633333333333334e-07, "loss": 0.0013, "num_tokens": 2531515.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 159.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03827158734202385, "kl": 0.00623775040730834, "learning_rate": 4.6599999999999996e-07, "loss": 0.0003, "num_tokens": 2531816.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 159.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06170351058244705, "kl": 0.009488976560533047, "learning_rate": 4.656666666666667e-07, "loss": 0.0005, "num_tokens": 2532145.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 159.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009644703241065145, "kl": 6.021261287969537e-05, "learning_rate": 4.6533333333333337e-07, "loss": 0.0, "num_tokens": 2532401.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 159.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.00020436133490875363, "kl": 4.559755325317383e-06, "learning_rate": 4.65e-07, "loss": 0.0, "num_tokens": 2532621.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 159.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.36243778467178345, "kl": 0.062055718153715134, "learning_rate": 4.6466666666666667e-07, "loss": 0.0029, "num_tokens": 2532963.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 159.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.9392231106758118, "kl": 0.14744428172707558, "learning_rate": 4.643333333333333e-07, "loss": 0.0101, "num_tokens": 2533274.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 159.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.8875181674957275, "kl": 0.006092198193073273, "learning_rate": 4.64e-07, "loss": 0.0254, "num_tokens": 2533545.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 159.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.08360299468040466, "kl": 0.024957576766610146, "learning_rate": 4.636666666666667e-07, "loss": 0.0013, "num_tokens": 2533854.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 159.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.13966958224773407, "kl": 0.04225391894578934, "learning_rate": 4.6333333333333333e-07, "loss": 0.0021, "num_tokens": 2534191.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 159.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.8296147584915161, "kl": 0.14229894056916237, "learning_rate": 4.63e-07, "loss": 0.0817, "num_tokens": 2534613.0, "reward": 2.174999952316284, "reward_std": 1.1786290407180786, "rewards/reward_combined/mean": 2.174999952316284, "rewards/reward_combined/std": 1.1786291599273682, "step": 8612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 159.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02089979499578476, "kl": 0.0007220010011224076, "learning_rate": 4.6266666666666673e-07, "loss": 0.0, "num_tokens": 2534829.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 159.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.557019233703613, "kl": 0.11554973479360342, "learning_rate": 4.6233333333333336e-07, "loss": 0.0259, "num_tokens": 2535104.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 159.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03735779598355293, "kl": 0.002009578049182892, "learning_rate": 4.62e-07, "loss": 0.0001, "num_tokens": 2535348.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 159.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02146579697728157, "kl": 0.26481232047080994, "learning_rate": 4.6166666666666666e-07, "loss": 0.0132, "num_tokens": 2535652.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 159.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04026374593377113, "kl": 0.0018839865952031687, "learning_rate": 4.613333333333334e-07, "loss": 0.0001, "num_tokens": 2535950.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 159.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 5.249157905578613, "kl": 0.33205313235521317, "learning_rate": 4.61e-07, "loss": 0.2582, "num_tokens": 2536173.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 8618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 159.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0649443045258522, "kl": 0.0010302364826202393, "learning_rate": 4.606666666666667e-07, "loss": 0.0001, "num_tokens": 2536379.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 159.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019475027220323682, "kl": 0.003537513315677643, "learning_rate": 4.603333333333333e-07, "loss": 0.0002, "num_tokens": 2536615.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 159.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.019151508808135986, "kl": 0.0006709507724735886, "learning_rate": 4.6e-07, "loss": 0.0, "num_tokens": 2536884.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 159.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.007090226747095585, "kl": 0.00010500549979042262, "learning_rate": 4.596666666666667e-07, "loss": 0.0, "num_tokens": 2537154.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01995955966413021, "kl": 0.0008948360045906156, "learning_rate": 4.5933333333333334e-07, "loss": 0.0, "num_tokens": 2537434.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 159.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.824875831604004, "kl": 0.0568277578568086, "learning_rate": 4.59e-07, "loss": 0.353, "num_tokens": 2537811.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 159.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0163147933781147, "kl": 0.0007652853382751346, "learning_rate": 4.5866666666666664e-07, "loss": 0.0, "num_tokens": 2538046.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 159.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0985773429274559, "kl": 0.006719287019222975, "learning_rate": 4.583333333333334e-07, "loss": 0.0003, "num_tokens": 2538307.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 159.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.08132348954677582, "kl": 0.0070319268852472305, "learning_rate": 4.5800000000000005e-07, "loss": 0.0004, "num_tokens": 2538605.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 159.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.19190256297588348, "kl": 0.03531090263277292, "learning_rate": 4.576666666666667e-07, "loss": 0.0017, "num_tokens": 2538890.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 159.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02645549736917019, "kl": 0.008679892867803574, "learning_rate": 4.573333333333333e-07, "loss": 0.0004, "num_tokens": 2539182.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 159.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004941096995025873, "kl": 0.00041915103793144226, "learning_rate": 4.57e-07, "loss": 0.0, "num_tokens": 2539442.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 159.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.08969242125749588, "kl": 0.017657252494245768, "learning_rate": 4.566666666666667e-07, "loss": 0.0009, "num_tokens": 2539728.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8631 }, { "clip_ratio/high_max": 0.0055555556900799274, "clip_ratio/high_mean": 0.0055555556900799274, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0055555556900799274, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 159.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.9159317016601562, "kl": 0.10721618309617043, "learning_rate": 4.5633333333333333e-07, "loss": 0.0006, "num_tokens": 2540110.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.036441702395677567, "kl": 0.030656098388135433, "learning_rate": 4.56e-07, "loss": 0.0015, "num_tokens": 2540384.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 159.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.025416595861315727, "kl": 0.09495680779218674, "learning_rate": 4.5566666666666663e-07, "loss": 0.0047, "num_tokens": 2540756.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 159.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.351839303970337, "kl": 0.1992230974137783, "learning_rate": 4.5533333333333336e-07, "loss": 0.0441, "num_tokens": 2541142.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 159.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.3036472797393799, "kl": 0.10299092531204224, "learning_rate": 4.5500000000000004e-07, "loss": 0.0266, "num_tokens": 2541480.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02982376702129841, "kl": 0.0021069254144094884, "learning_rate": 4.5466666666666666e-07, "loss": 0.0001, "num_tokens": 2541809.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 159.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04965667426586151, "kl": 0.012602425646036863, "learning_rate": 4.5433333333333334e-07, "loss": 0.0006, "num_tokens": 2542113.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 159.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03606492653489113, "kl": 0.005830902606248856, "learning_rate": 4.5399999999999996e-07, "loss": 0.0003, "num_tokens": 2542399.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 160.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.07060271501541138, "kl": 0.003959645750001073, "learning_rate": 4.536666666666667e-07, "loss": 0.0002, "num_tokens": 2542658.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 160.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037361718714237213, "kl": 0.0019822437316179276, "learning_rate": 4.5333333333333337e-07, "loss": 0.0001, "num_tokens": 2542970.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8641 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 160.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.3153436183929443, "kl": 0.09401218220591545, "learning_rate": 4.53e-07, "loss": -0.0555, "num_tokens": 2543348.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 160.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02069438062608242, "kl": 0.0004439055919647217, "learning_rate": 4.5266666666666667e-07, "loss": 0.0, "num_tokens": 2543558.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0434831865131855, "kl": 0.004447659943252802, "learning_rate": 4.523333333333334e-07, "loss": 0.0002, "num_tokens": 2543850.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 160.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 3.218278169631958, "kl": 0.08415662194602191, "learning_rate": 4.52e-07, "loss": -0.0122, "num_tokens": 2544143.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 160.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.022728722542524338, "kl": 0.0007326866034418344, "learning_rate": 4.5166666666666665e-07, "loss": 0.0, "num_tokens": 2544394.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 160.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.004622384440153837, "kl": 0.00038431957364082336, "learning_rate": 4.513333333333333e-07, "loss": 0.0, "num_tokens": 2544654.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 160.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.001387011376209557, "kl": 3.723055124282837e-05, "learning_rate": 4.5100000000000005e-07, "loss": 0.0, "num_tokens": 2544866.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 160.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.02367924153804779, "kl": 0.0006719735392834991, "learning_rate": 4.506666666666667e-07, "loss": 0.0, "num_tokens": 2545100.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 160.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05397434160113335, "kl": 0.16236093640327454, "learning_rate": 4.5033333333333336e-07, "loss": 0.0081, "num_tokens": 2545410.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03227872774004936, "kl": 0.002121766214258969, "learning_rate": 4.5e-07, "loss": 0.0001, "num_tokens": 2545683.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 160.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.024951478466391563, "kl": 0.0016541053773835301, "learning_rate": 4.4966666666666666e-07, "loss": 0.0001, "num_tokens": 2545957.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 4.1402907371521, "kl": 0.019108325242996216, "learning_rate": 4.493333333333334e-07, "loss": 0.122, "num_tokens": 2546249.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 160.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.882719874382019, "kl": 0.14848625287413597, "learning_rate": 4.49e-07, "loss": 0.0184, "num_tokens": 2546590.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 8654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 160.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010141506209038198, "kl": 2.346932888031006e-06, "learning_rate": 4.486666666666667e-07, "loss": 0.0, "num_tokens": 2546810.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 160.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.022499442100524902, "kl": 0.007729256059974432, "learning_rate": 4.483333333333333e-07, "loss": 0.0004, "num_tokens": 2547104.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 160.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021007382310926914, "kl": 4.374980926513672e-05, "learning_rate": 4.4800000000000004e-07, "loss": 0.0, "num_tokens": 2547360.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 160.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.035914354026317596, "kl": 0.007574115530587733, "learning_rate": 4.476666666666667e-07, "loss": 0.0004, "num_tokens": 2547703.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 160.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019422711338847876, "kl": 0.0035448744893074036, "learning_rate": 4.4733333333333334e-07, "loss": 0.0002, "num_tokens": 2547939.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.04819255694746971, "kl": 0.02114738430827856, "learning_rate": 4.4699999999999997e-07, "loss": 0.0012, "num_tokens": 2548219.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.007517922669649124, "kl": 0.0014330648118630052, "learning_rate": 4.4666666666666664e-07, "loss": 0.0001, "num_tokens": 2548496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 160.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201191395521164, "kl": 0.040469877421855927, "learning_rate": 4.4633333333333337e-07, "loss": 0.002, "num_tokens": 2548856.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 160.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.018833497539162636, "kl": 0.0008993387164082378, "learning_rate": 4.46e-07, "loss": 0.0, "num_tokens": 2549184.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.13987883925437927, "kl": 0.027296412270516157, "learning_rate": 4.456666666666667e-07, "loss": 0.0014, "num_tokens": 2549462.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 61.75, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 61.75, "completions/mean_terminated_length": 61.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.8982056379318237, "kl": 0.011345220729708672, "learning_rate": 4.453333333333333e-07, "loss": 0.4524, "num_tokens": 2549933.0, "reward": 6.050000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 6.050000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 8665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 160.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02985193394124508, "kl": 0.004056473029777408, "learning_rate": 4.4500000000000003e-07, "loss": 0.0002, "num_tokens": 2550262.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 76.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.0085318088531494, "kl": 0.048484018072485924, "learning_rate": 4.446666666666667e-07, "loss": 0.4558, "num_tokens": 2550789.0, "reward": 5.675000190734863, "reward_std": 3.6499998569488525, "rewards/reward_combined/mean": 5.675000190734863, "rewards/reward_combined/std": 3.6500000953674316, "step": 8667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06846870481967926, "kl": 0.010346206836402416, "learning_rate": 4.4433333333333333e-07, "loss": 0.0005, "num_tokens": 2551078.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.8624725341796875, "kl": 0.14514271169900894, "learning_rate": 4.44e-07, "loss": -0.2538, "num_tokens": 2551381.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 8669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 160.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03502403572201729, "kl": 0.004048785107443109, "learning_rate": 4.4366666666666663e-07, "loss": 0.0002, "num_tokens": 2551639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 160.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07920677214860916, "kl": 0.008172958623617887, "learning_rate": 4.4333333333333336e-07, "loss": 0.0004, "num_tokens": 2551944.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 160.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.032318998128175735, "kl": 0.00595466373488307, "learning_rate": 4.4300000000000004e-07, "loss": 0.0003, "num_tokens": 2552325.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 160.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.020329784601926804, "kl": 0.006174417831061874, "learning_rate": 4.4266666666666666e-07, "loss": 0.0003, "num_tokens": 2552597.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 160.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.08013773709535599, "kl": 0.00739249074831605, "learning_rate": 4.4233333333333334e-07, "loss": 0.0004, "num_tokens": 2552816.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 160.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09457864612340927, "kl": 0.042728934437036514, "learning_rate": 4.4200000000000007e-07, "loss": 0.002, "num_tokens": 2553193.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 160.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.25403594970703125, "kl": 0.040088089561322704, "learning_rate": 4.416666666666667e-07, "loss": 0.0023, "num_tokens": 2553457.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06741530448198318, "kl": 0.007070574734825641, "learning_rate": 4.413333333333333e-07, "loss": 0.0004, "num_tokens": 2553755.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.010327140800654888, "kl": 0.0006848268094472587, "learning_rate": 4.41e-07, "loss": 0.0, "num_tokens": 2554041.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 160.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016408642986789346, "kl": 0.00011049045861000195, "learning_rate": 4.406666666666667e-07, "loss": 0.0, "num_tokens": 2554353.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 160.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.021787144243717194, "kl": 0.0009699314832687378, "learning_rate": 4.4033333333333335e-07, "loss": 0.0, "num_tokens": 2554565.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 160.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04410265013575554, "kl": 0.014314462430775166, "learning_rate": 4.4e-07, "loss": 0.0007, "num_tokens": 2554869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 160.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04200639948248863, "kl": 0.011541795916855335, "learning_rate": 4.3966666666666665e-07, "loss": 0.0006, "num_tokens": 2555204.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018939394503831863, "clip_ratio/low_min": 0.0018939394503831863, "clip_ratio/region_mean": 0.0018939394503831863, "completion_length": 70.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 70.5, "completions/mean_terminated_length": 8.666666984558105, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 160.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.058505058288574, "kl": 0.003887388505972922, "learning_rate": 4.393333333333333e-07, "loss": 0.47, "num_tokens": 2555706.0, "reward": 7.300000190734863, "reward_std": 0.40000009536743164, "rewards/reward_combined/mean": 7.300000190734863, "rewards/reward_combined/std": 0.40000009536743164, "step": 8683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 160.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02446591481566429, "kl": 0.03118347004055977, "learning_rate": 4.3900000000000005e-07, "loss": 0.0016, "num_tokens": 2556112.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 160.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.6730837225914001, "kl": 0.06484004156664014, "learning_rate": 4.386666666666667e-07, "loss": 0.0029, "num_tokens": 2556408.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 160.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.0480692386627197, "kl": 0.057574307546019554, "learning_rate": 4.3833333333333335e-07, "loss": -0.0269, "num_tokens": 2556773.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 160.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.021530449390411377, "kl": 0.264899879693985, "learning_rate": 4.38e-07, "loss": 0.0132, "num_tokens": 2557077.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 160.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.004730751272290945, "kl": 0.00020885467529296875, "learning_rate": 4.376666666666667e-07, "loss": 0.0, "num_tokens": 2557297.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 160.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.4174129962921143, "kl": 0.07839278131723404, "learning_rate": 4.373333333333334e-07, "loss": -0.0481, "num_tokens": 2557599.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 160.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.1381855010986328, "kl": 0.013612152077257633, "learning_rate": 4.37e-07, "loss": 0.0009, "num_tokens": 2557941.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 160.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.08004219084978104, "kl": 0.010712900198996067, "learning_rate": 4.3666666666666663e-07, "loss": 0.0006, "num_tokens": 2558261.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 160.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.027102116495370865, "kl": 0.0008121976570691913, "learning_rate": 4.363333333333333e-07, "loss": 0.0, "num_tokens": 2558528.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 160.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.020501334220170975, "kl": 0.011936224065721035, "learning_rate": 4.3600000000000004e-07, "loss": 0.0006, "num_tokens": 2558788.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 161.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.028638113290071487, "kl": 0.09418239071965218, "learning_rate": 4.3566666666666666e-07, "loss": 0.0047, "num_tokens": 2559161.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 161.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.039101943373680115, "kl": 0.0010855465952772647, "learning_rate": 4.3533333333333334e-07, "loss": 0.0001, "num_tokens": 2559417.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 161.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009871583431959152, "kl": 0.0007590443128719926, "learning_rate": 4.3499999999999996e-07, "loss": 0.0, "num_tokens": 2559677.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 161.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02007290907204151, "kl": 0.0029251491650938988, "learning_rate": 4.346666666666667e-07, "loss": 0.0001, "num_tokens": 2559989.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 161.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.004962783306837082, "kl": 5.539953781408258e-05, "learning_rate": 4.3433333333333337e-07, "loss": 0.0, "num_tokens": 2560202.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 5.6566057205200195, "kl": 0.04266488179564476, "learning_rate": 4.34e-07, "loss": 0.078, "num_tokens": 2560487.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 161.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.5060837864875793, "kl": 0.03299644310027361, "learning_rate": 4.3366666666666667e-07, "loss": 0.0022, "num_tokens": 2560769.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 161.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.2146897315979, "kl": 0.03656699322164059, "learning_rate": 4.333333333333333e-07, "loss": 0.0013, "num_tokens": 2561124.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 161.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05218846723437309, "kl": 0.0114898094907403, "learning_rate": 4.33e-07, "loss": 0.0006, "num_tokens": 2561449.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 161.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.15996979176998138, "kl": 0.0243934728205204, "learning_rate": 4.326666666666667e-07, "loss": 0.0012, "num_tokens": 2561758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 161.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.029595768079161644, "kl": 0.09536663070321083, "learning_rate": 4.3233333333333333e-07, "loss": 0.0048, "num_tokens": 2562130.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 161.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.045077454298734665, "kl": 0.0015447183977812529, "learning_rate": 4.32e-07, "loss": 0.0001, "num_tokens": 2562394.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 161.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006517356378026307, "kl": 0.0012625583331100643, "learning_rate": 4.3166666666666673e-07, "loss": 0.0001, "num_tokens": 2562674.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 161.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.033829428255558014, "kl": 0.001004789024591446, "learning_rate": 4.3133333333333336e-07, "loss": 0.0001, "num_tokens": 2562934.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 161.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05706070363521576, "kl": 0.004463211516849697, "learning_rate": 4.31e-07, "loss": 0.0002, "num_tokens": 2563259.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05499446019530296, "kl": 0.013950803317129612, "learning_rate": 4.3066666666666666e-07, "loss": 0.0008, "num_tokens": 2563533.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.846357822418213, "kl": 0.017044548789272085, "learning_rate": 4.303333333333334e-07, "loss": 0.1061, "num_tokens": 2563835.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 161.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.17474430799484253, "kl": 0.04627208597958088, "learning_rate": 4.3e-07, "loss": 0.0024, "num_tokens": 2564126.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.30064892768859863, "kl": 0.022759624291211367, "learning_rate": 4.296666666666667e-07, "loss": 0.0013, "num_tokens": 2564405.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 161.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05347103998064995, "kl": 0.0051518643740564585, "learning_rate": 4.293333333333333e-07, "loss": 0.0003, "num_tokens": 2564735.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 161.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03452187776565552, "kl": 0.033436816185712814, "learning_rate": 4.29e-07, "loss": 0.0017, "num_tokens": 2565140.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 161.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.09435445070266724, "kl": 0.01236627995967865, "learning_rate": 4.286666666666667e-07, "loss": 0.0006, "num_tokens": 2565479.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 161.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.031507983803749084, "kl": 0.001063937903381884, "learning_rate": 4.2833333333333334e-07, "loss": 0.0001, "num_tokens": 2565755.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 161.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.0394740104675293, "kl": 0.15030419826507568, "learning_rate": 4.28e-07, "loss": 0.051, "num_tokens": 2566131.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 161.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.1364179253578186, "kl": 0.017006624955683947, "learning_rate": 4.2766666666666664e-07, "loss": 0.0008, "num_tokens": 2566422.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 161.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09620498865842819, "kl": 0.0694228857755661, "learning_rate": 4.273333333333334e-07, "loss": 0.0035, "num_tokens": 2566799.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 161.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1384589523077011, "kl": 0.014049086938030086, "learning_rate": 4.2700000000000005e-07, "loss": 0.001, "num_tokens": 2567041.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 161.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.9725539684295654, "kl": 0.13209390453994274, "learning_rate": 4.266666666666667e-07, "loss": 0.2107, "num_tokens": 2567410.0, "reward": 4.875, "reward_std": 5.25, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 5.25, "step": 8721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 161.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01852143369615078, "kl": 0.008005412295460701, "learning_rate": 4.263333333333333e-07, "loss": 0.0004, "num_tokens": 2567684.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 161.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02794245071709156, "kl": 0.0004574150952976197, "learning_rate": 4.26e-07, "loss": 0.0, "num_tokens": 2567927.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 161.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.004467817023396492, "kl": 0.00020632743689930066, "learning_rate": 4.256666666666667e-07, "loss": 0.0, "num_tokens": 2568147.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 161.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.9460231065750122, "kl": 0.1277504339814186, "learning_rate": 4.2533333333333333e-07, "loss": 0.0066, "num_tokens": 2568464.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 161.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02923583798110485, "kl": 0.006599893171369331, "learning_rate": 4.25e-07, "loss": 0.0003, "num_tokens": 2568736.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 161.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08563487976789474, "kl": 0.03685462847352028, "learning_rate": 4.2466666666666663e-07, "loss": 0.0018, "num_tokens": 2569034.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 161.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.07928966730833054, "kl": 0.01721447065938264, "learning_rate": 4.2433333333333336e-07, "loss": 0.0009, "num_tokens": 2569363.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 161.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.042829133570194244, "kl": 0.006660679820924997, "learning_rate": 4.2400000000000004e-07, "loss": 0.0003, "num_tokens": 2569654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 161.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.001952077727764845, "kl": 0.003544352948665619, "learning_rate": 4.2366666666666666e-07, "loss": 0.0002, "num_tokens": 2569890.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 161.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09785790741443634, "kl": 0.0075979826506227255, "learning_rate": 4.2333333333333334e-07, "loss": 0.0004, "num_tokens": 2570166.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 161.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01978292502462864, "kl": 0.0007598996162414551, "learning_rate": 4.2299999999999996e-07, "loss": 0.0, "num_tokens": 2570378.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 161.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06639070063829422, "kl": 0.0010628923773765564, "learning_rate": 4.226666666666667e-07, "loss": 0.0001, "num_tokens": 2570588.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 161.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.1997898668050766, "kl": 0.05732985585927963, "learning_rate": 4.2233333333333337e-07, "loss": 0.0029, "num_tokens": 2570862.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 161.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.005731337238103151, "kl": 0.00033209921093657613, "learning_rate": 4.22e-07, "loss": 0.0, "num_tokens": 2571122.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 161.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.012623523361980915, "kl": 0.00041728348878677934, "learning_rate": 4.2166666666666667e-07, "loss": 0.0, "num_tokens": 2571436.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 161.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.888056755065918, "kl": 0.0346012469381094, "learning_rate": 4.213333333333334e-07, "loss": 0.0994, "num_tokens": 2571731.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 161.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02708945795893669, "kl": 0.1629871353507042, "learning_rate": 4.21e-07, "loss": 0.0081, "num_tokens": 2572040.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 161.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.005142015404999256, "kl": 0.0014799535274505615, "learning_rate": 4.2066666666666665e-07, "loss": 0.0001, "num_tokens": 2572256.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 161.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03366416320204735, "kl": 0.003132957383058965, "learning_rate": 4.203333333333333e-07, "loss": 0.0002, "num_tokens": 2572558.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 161.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.02851632796227932, "kl": 0.0005180761218070984, "learning_rate": 4.2000000000000006e-07, "loss": 0.0, "num_tokens": 2572778.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 161.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.0935473442077637, "kl": 0.384590744972229, "learning_rate": 4.196666666666667e-07, "loss": 0.0389, "num_tokens": 2573083.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 161.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.06690575182437897, "kl": 0.034862760454416275, "learning_rate": 4.1933333333333336e-07, "loss": 0.0017, "num_tokens": 2573464.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 161.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.10117001086473465, "kl": 0.033521804958581924, "learning_rate": 4.19e-07, "loss": 0.0017, "num_tokens": 2573772.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 161.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03286002576351166, "kl": 0.008927557151764631, "learning_rate": 4.1866666666666666e-07, "loss": 0.0004, "num_tokens": 2574126.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 161.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.83968186378479, "kl": 0.0157951662549749, "learning_rate": 4.183333333333334e-07, "loss": 0.0537, "num_tokens": 2574418.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 161.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.20594388246536255, "kl": 0.02551776822656393, "learning_rate": 4.18e-07, "loss": 0.0015, "num_tokens": 2574718.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 162.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.021271243691444397, "kl": 0.011651545763015747, "learning_rate": 4.176666666666667e-07, "loss": 0.0006, "num_tokens": 2574978.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 162.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.006680145859718323, "kl": 0.1638127788901329, "learning_rate": 4.173333333333333e-07, "loss": 0.0082, "num_tokens": 2575286.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 162.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008309624157845974, "kl": 0.000858905230415985, "learning_rate": 4.1700000000000004e-07, "loss": 0.0, "num_tokens": 2575570.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 162.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.08076220005750656, "kl": 0.00554187607485801, "learning_rate": 4.166666666666667e-07, "loss": 0.0003, "num_tokens": 2575846.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 162.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00872249435633421, "kl": 0.00010294913590769283, "learning_rate": 4.1633333333333334e-07, "loss": 0.0, "num_tokens": 2576102.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 162.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03481372073292732, "kl": 0.002255752682685852, "learning_rate": 4.1599999999999997e-07, "loss": 0.0001, "num_tokens": 2576318.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 162.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05893750116229057, "kl": 0.010167696047574282, "learning_rate": 4.1566666666666664e-07, "loss": 0.0005, "num_tokens": 2576594.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 162.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.029528116807341576, "kl": 0.0009664545068517327, "learning_rate": 4.1533333333333337e-07, "loss": 0.0001, "num_tokens": 2576810.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 162.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.020393524318933487, "kl": 0.0013765437761321664, "learning_rate": 4.15e-07, "loss": 0.0001, "num_tokens": 2577044.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 162.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.035633333027362823, "kl": 0.0019097463809885085, "learning_rate": 4.146666666666667e-07, "loss": 0.0001, "num_tokens": 2577340.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 162.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01583975739777088, "kl": 0.019105815328657627, "learning_rate": 4.143333333333333e-07, "loss": 0.001, "num_tokens": 2577618.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 162.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.047380611300468445, "kl": 0.005481145344674587, "learning_rate": 4.1400000000000003e-07, "loss": 0.0003, "num_tokens": 2577946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 162.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08289684355258942, "kl": 0.014533222652971745, "learning_rate": 4.136666666666667e-07, "loss": 0.0008, "num_tokens": 2578234.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 162.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.009104477241635323, "kl": 0.00032294541597366333, "learning_rate": 4.1333333333333333e-07, "loss": 0.0, "num_tokens": 2578478.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 162.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.045783333480358124, "kl": 0.09446677565574646, "learning_rate": 4.13e-07, "loss": 0.0047, "num_tokens": 2578852.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 162.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.34242498874664307, "kl": 0.048494853079319, "learning_rate": 4.1266666666666663e-07, "loss": 0.0024, "num_tokens": 2579150.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 162.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02272961102426052, "kl": 0.00021896511316299438, "learning_rate": 4.1233333333333336e-07, "loss": 0.0, "num_tokens": 2579362.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 162.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03676637262105942, "kl": 0.001942979171872139, "learning_rate": 4.1200000000000004e-07, "loss": 0.0001, "num_tokens": 2579686.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 162.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03820367529988289, "kl": 0.005818691803142428, "learning_rate": 4.1166666666666666e-07, "loss": 0.0003, "num_tokens": 2579976.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 162.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0423758402466774, "kl": 0.009468005038797855, "learning_rate": 4.1133333333333334e-07, "loss": 0.0005, "num_tokens": 2580308.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 162.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.13000838458538055, "kl": 0.03317350219003856, "learning_rate": 4.1100000000000007e-07, "loss": 0.0017, "num_tokens": 2580597.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 162.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018180948682129383, "kl": 0.0035621225833892822, "learning_rate": 4.106666666666667e-07, "loss": 0.0002, "num_tokens": 2580833.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 162.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0681513249874115, "kl": 0.06191878952085972, "learning_rate": 4.103333333333333e-07, "loss": 0.0031, "num_tokens": 2581208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 162.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03283923119306564, "kl": 0.002975815557874739, "learning_rate": 4.1e-07, "loss": 0.0001, "num_tokens": 2581500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 162.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02938288077712059, "kl": 0.006535356555104954, "learning_rate": 4.096666666666667e-07, "loss": 0.0003, "num_tokens": 2581772.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 162.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02980111353099346, "kl": 0.003871294902637601, "learning_rate": 4.0933333333333335e-07, "loss": 0.0002, "num_tokens": 2582108.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 162.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.859311580657959, "kl": 0.588653638958931, "learning_rate": 4.09e-07, "loss": 0.014, "num_tokens": 2582411.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 162.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.11337968707084656, "kl": 0.02585682040080428, "learning_rate": 4.0866666666666665e-07, "loss": 0.0013, "num_tokens": 2582685.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 162.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.144474506378174, "kl": 0.06048629805445671, "learning_rate": 4.083333333333333e-07, "loss": 0.0979, "num_tokens": 2582978.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 162.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06555739790201187, "kl": 0.028324289247393608, "learning_rate": 4.0800000000000005e-07, "loss": 0.0014, "num_tokens": 2583324.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 162.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.09180674701929092, "kl": 0.012973645003512502, "learning_rate": 4.076666666666667e-07, "loss": 0.0006, "num_tokens": 2583596.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 162.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.1345089077949524, "kl": 0.01267294306308031, "learning_rate": 4.0733333333333335e-07, "loss": 0.0007, "num_tokens": 2583853.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 162.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.16680441796779633, "kl": 0.05141289532184601, "learning_rate": 4.07e-07, "loss": 0.0026, "num_tokens": 2584151.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 162.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.054858043789863586, "kl": 0.04051884450018406, "learning_rate": 4.066666666666667e-07, "loss": 0.002, "num_tokens": 2584549.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 162.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02385501191020012, "kl": 0.0009342855628347024, "learning_rate": 4.063333333333334e-07, "loss": 0.0, "num_tokens": 2584865.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 162.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.15966592729091644, "kl": 0.019496652763336897, "learning_rate": 4.06e-07, "loss": 0.001, "num_tokens": 2585155.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 162.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0056667691096663475, "kl": 0.00030512810917571187, "learning_rate": 4.0566666666666663e-07, "loss": 0.0, "num_tokens": 2585415.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 162.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 3.068002843065187e-05, "kl": 2.123415470123291e-06, "learning_rate": 4.053333333333333e-07, "loss": 0.0, "num_tokens": 2585635.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 162.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03649111092090607, "kl": 0.051969584077596664, "learning_rate": 4.0500000000000004e-07, "loss": 0.0026, "num_tokens": 2585967.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 162.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.2128031998872757, "kl": 0.02754961373284459, "learning_rate": 4.0466666666666666e-07, "loss": 0.0016, "num_tokens": 2586292.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 162.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0313015952706337, "kl": 0.03878430649638176, "learning_rate": 4.0433333333333334e-07, "loss": 0.0019, "num_tokens": 2586697.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 162.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.033458877354860306, "kl": 0.0011168313212692738, "learning_rate": 4.0399999999999996e-07, "loss": 0.0001, "num_tokens": 2586962.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 162.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.017862927168607712, "kl": 0.00031419098377227783, "learning_rate": 4.036666666666667e-07, "loss": 0.0, "num_tokens": 2587166.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 162.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.492960214614868, "kl": 0.049693545675836504, "learning_rate": 4.0333333333333337e-07, "loss": 0.0046, "num_tokens": 2587426.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 162.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011306415544822812, "kl": 0.0003612823784351349, "learning_rate": 4.03e-07, "loss": 0.0, "num_tokens": 2587686.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 162.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.004961847327649593, "kl": 0.00025772452499950305, "learning_rate": 4.0266666666666667e-07, "loss": 0.0, "num_tokens": 2587906.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 162.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.021443558856844902, "kl": 0.002437490038573742, "learning_rate": 4.023333333333333e-07, "loss": 0.0001, "num_tokens": 2588218.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 162.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.015272899530827999, "kl": 0.0017460708040744066, "learning_rate": 4.02e-07, "loss": 0.0001, "num_tokens": 2588495.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 162.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05985037237405777, "kl": 0.005212311400100589, "learning_rate": 4.016666666666667e-07, "loss": 0.0002, "num_tokens": 2588759.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 162.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.048719655722379684, "kl": 0.012635144405066967, "learning_rate": 4.0133333333333333e-07, "loss": 0.0006, "num_tokens": 2589020.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 162.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654810294508934, "kl": 0.006403079256415367, "learning_rate": 4.01e-07, "loss": 0.0003, "num_tokens": 2589322.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 162.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.029209721833467484, "kl": 0.005799456033855677, "learning_rate": 4.0066666666666673e-07, "loss": 0.0003, "num_tokens": 2589703.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 162.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.11022688448429108, "kl": 0.06149779632687569, "learning_rate": 4.0033333333333336e-07, "loss": 0.003, "num_tokens": 2590056.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 162.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.058898910880088806, "kl": 0.008033715363126248, "learning_rate": 4e-07, "loss": 0.0004, "num_tokens": 2590384.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 163.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.036348968744277954, "kl": 0.007658802671357989, "learning_rate": 3.9966666666666666e-07, "loss": 0.0003, "num_tokens": 2590677.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 163.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.020849348977208138, "kl": 0.0010426198423374444, "learning_rate": 3.993333333333334e-07, "loss": 0.0001, "num_tokens": 2590948.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 163.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.9002044200897217, "kl": 0.38438936322927475, "learning_rate": 3.99e-07, "loss": 0.0187, "num_tokens": 2591315.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 163.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.19108644127845764, "kl": 0.01586221158504486, "learning_rate": 3.986666666666667e-07, "loss": 0.001, "num_tokens": 2591642.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.035857606679201126, "kl": 0.005893201567232609, "learning_rate": 3.983333333333333e-07, "loss": 0.0003, "num_tokens": 2591920.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 163.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03369211405515671, "kl": 0.004830272751860321, "learning_rate": 3.98e-07, "loss": 0.0002, "num_tokens": 2592218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 163.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.023553693667054176, "kl": 0.0012775935174431652, "learning_rate": 3.976666666666667e-07, "loss": 0.0001, "num_tokens": 2592543.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 163.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.11439812183380127, "kl": 0.036335716024041176, "learning_rate": 3.9733333333333334e-07, "loss": 0.0018, "num_tokens": 2592869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 163.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.001765990979038179, "kl": 0.0035751760005950928, "learning_rate": 3.97e-07, "loss": 0.0002, "num_tokens": 2593105.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 163.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.025588566437363625, "kl": 0.0005647182551911101, "learning_rate": 3.9666666666666665e-07, "loss": 0.0, "num_tokens": 2593329.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 163.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 7.770601769152563e-06, "kl": 1.8998980522155762e-06, "learning_rate": 3.963333333333334e-07, "loss": 0.0, "num_tokens": 2593549.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 163.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1666845828294754, "kl": 0.04413004405796528, "learning_rate": 3.9600000000000005e-07, "loss": 0.0022, "num_tokens": 2593860.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 163.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 8.258151054382324, "kl": 0.0077703624265268445, "learning_rate": 3.956666666666667e-07, "loss": 0.2159, "num_tokens": 2594101.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 163.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04936312511563301, "kl": 0.004371569724753499, "learning_rate": 3.953333333333333e-07, "loss": 0.0003, "num_tokens": 2594320.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 163.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.14638277888298035, "kl": 0.0029341131448745728, "learning_rate": 3.95e-07, "loss": 0.0002, "num_tokens": 2594528.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.017148692160844803, "kl": 0.018850659020245075, "learning_rate": 3.946666666666667e-07, "loss": 0.0011, "num_tokens": 2594806.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 163.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03652740642428398, "kl": 0.0014877381036058068, "learning_rate": 3.9433333333333333e-07, "loss": 0.0001, "num_tokens": 2595132.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.496829032897949, "kl": 0.02485764119774103, "learning_rate": 3.94e-07, "loss": 0.4518, "num_tokens": 2595643.0, "reward": 5.050000190734863, "reward_std": 5.900000095367432, "rewards/reward_combined/mean": 5.050000190734863, "rewards/reward_combined/std": 5.90000057220459, "step": 8819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 163.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.11403830349445343, "kl": 0.010286442004144192, "learning_rate": 3.9366666666666663e-07, "loss": 0.0005, "num_tokens": 2595935.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 163.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.012786706909537315, "kl": 0.26647940278053284, "learning_rate": 3.9333333333333336e-07, "loss": 0.0133, "num_tokens": 2596239.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 163.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.8645275235176086, "kl": 0.10465402714908123, "learning_rate": 3.9300000000000004e-07, "loss": 0.0061, "num_tokens": 2596626.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 163.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05022507905960083, "kl": 0.005733102094382048, "learning_rate": 3.9266666666666666e-07, "loss": 0.0003, "num_tokens": 2596918.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 163.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03051046095788479, "kl": 0.00468259584158659, "learning_rate": 3.9233333333333334e-07, "loss": 0.0002, "num_tokens": 2597209.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 163.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02554413303732872, "kl": 0.0009007410626509227, "learning_rate": 3.9199999999999996e-07, "loss": 0.0, "num_tokens": 2597518.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 163.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.029555946588516235, "kl": 0.00046730042959097773, "learning_rate": 3.916666666666667e-07, "loss": 0.0, "num_tokens": 2597774.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 163.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.9149326682090759, "kl": 0.07168246898800135, "learning_rate": 3.9133333333333337e-07, "loss": 0.0051, "num_tokens": 2598073.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03249691426753998, "kl": 0.0016801682650111616, "learning_rate": 3.91e-07, "loss": 0.0001, "num_tokens": 2598369.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 163.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.007899384014308453, "kl": 0.16168268769979477, "learning_rate": 3.906666666666666e-07, "loss": 0.0081, "num_tokens": 2598678.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 163.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09292007982730865, "kl": 0.01604555733501911, "learning_rate": 3.903333333333334e-07, "loss": 0.0008, "num_tokens": 2599029.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 163.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02616010420024395, "kl": 0.0030356637435033917, "learning_rate": 3.9e-07, "loss": 0.0001, "num_tokens": 2599287.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 163.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.106031894683838, "kl": 0.24775194749236107, "learning_rate": 3.8966666666666665e-07, "loss": 0.1202, "num_tokens": 2599634.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 8832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 163.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.01660798117518425, "kl": 0.00041464615787845105, "learning_rate": 3.893333333333333e-07, "loss": 0.0, "num_tokens": 2599904.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 163.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02174406871199608, "kl": 0.0007948676793603227, "learning_rate": 3.8900000000000006e-07, "loss": 0.0, "num_tokens": 2600120.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 163.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004449574276804924, "kl": 0.00028362125158309937, "learning_rate": 3.886666666666667e-07, "loss": 0.0, "num_tokens": 2600364.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 163.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.08093485236167908, "kl": 0.03179318364709616, "learning_rate": 3.8833333333333336e-07, "loss": 0.0016, "num_tokens": 2600664.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 163.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.12022906541824341, "kl": 0.007909214589744806, "learning_rate": 3.88e-07, "loss": 0.0003, "num_tokens": 2600918.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 163.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.016501858830451965, "kl": 0.005264589213766158, "learning_rate": 3.8766666666666666e-07, "loss": 0.0003, "num_tokens": 2601247.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 163.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02550019510090351, "kl": 0.03701267670840025, "learning_rate": 3.873333333333334e-07, "loss": 0.0018, "num_tokens": 2601652.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06674180179834366, "kl": 0.004657938843593001, "learning_rate": 3.87e-07, "loss": 0.0002, "num_tokens": 2601926.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0161620881408453, "kl": 0.0010424046195112169, "learning_rate": 3.866666666666667e-07, "loss": 0.0001, "num_tokens": 2602208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 163.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026068571023643017, "kl": 0.0002920932893175632, "learning_rate": 3.863333333333333e-07, "loss": 0.0, "num_tokens": 2602470.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004999999888241291, "clip_ratio/low_min": 0.004999999888241291, "clip_ratio/region_mean": 0.004999999888241291, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 163.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.733632802963257, "kl": 0.06945792399346828, "learning_rate": 3.8600000000000004e-07, "loss": 0.3832, "num_tokens": 2602846.0, "reward": 3.049999952316284, "reward_std": 5.499393939971924, "rewards/reward_combined/mean": 3.049999952316284, "rewards/reward_combined/std": 5.499393939971924, "step": 8843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 163.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06728813052177429, "kl": 0.0016703461296856403, "learning_rate": 3.856666666666667e-07, "loss": 0.0001, "num_tokens": 2603116.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 163.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.19651243090629578, "kl": 0.033940818160772324, "learning_rate": 3.8533333333333334e-07, "loss": 0.0017, "num_tokens": 2603427.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03139230236411095, "kl": 0.0015383082791231573, "learning_rate": 3.8499999999999997e-07, "loss": 0.0001, "num_tokens": 2603700.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 163.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.05385978892445564, "kl": 0.006028156960383058, "learning_rate": 3.8466666666666664e-07, "loss": 0.0003, "num_tokens": 2604004.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 163.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.028173333033919334, "kl": 0.00023711472749710083, "learning_rate": 3.8433333333333337e-07, "loss": 0.0, "num_tokens": 2604216.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 163.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.08144671469926834, "kl": 0.0170698466245085, "learning_rate": 3.84e-07, "loss": 0.0009, "num_tokens": 2604502.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 163.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.028200753033161163, "kl": 0.005478203878737986, "learning_rate": 3.836666666666667e-07, "loss": 0.0003, "num_tokens": 2604835.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 163.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.022222740575671196, "kl": 0.011473238468170166, "learning_rate": 3.833333333333333e-07, "loss": 0.0006, "num_tokens": 2605095.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 163.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.003030191408470273, "kl": 0.00035396963357925415, "learning_rate": 3.8300000000000003e-07, "loss": 0.0, "num_tokens": 2605355.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 163.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.029974713921546936, "kl": 0.09397260844707489, "learning_rate": 3.826666666666667e-07, "loss": 0.0047, "num_tokens": 2605728.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 163.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03654379025101662, "kl": 0.05493824928998947, "learning_rate": 3.8233333333333333e-07, "loss": 0.0027, "num_tokens": 2606111.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8854 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.012820512987673283, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 163.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.654800534248352, "kl": 0.010154719115234911, "learning_rate": 3.82e-07, "loss": 0.0013, "num_tokens": 2606434.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 164.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006452057277783751, "kl": 0.0012512527173385024, "learning_rate": 3.8166666666666663e-07, "loss": 0.0001, "num_tokens": 2606714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 164.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.055416300892829895, "kl": 0.0012119606253691018, "learning_rate": 3.8133333333333336e-07, "loss": 0.0001, "num_tokens": 2606927.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 164.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.10294818133115768, "kl": 0.029007730074226856, "learning_rate": 3.8100000000000004e-07, "loss": 0.0016, "num_tokens": 2607237.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 164.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.05681667476892471, "kl": 0.04493633843958378, "learning_rate": 3.8066666666666666e-07, "loss": 0.0022, "num_tokens": 2607643.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 164.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 14.995560646057129, "kl": 0.24177805866929702, "learning_rate": 3.803333333333333e-07, "loss": -0.1447, "num_tokens": 2607874.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 8860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 164.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03134210780262947, "kl": 0.005656247725710273, "learning_rate": 3.8000000000000007e-07, "loss": 0.0003, "num_tokens": 2608142.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 164.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018654853338375688, "kl": 0.003564111888408661, "learning_rate": 3.796666666666667e-07, "loss": 0.0002, "num_tokens": 2608378.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.9843974113464355, "kl": 0.04015055298805237, "learning_rate": 3.793333333333333e-07, "loss": -0.0185, "num_tokens": 2608648.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 164.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08029883354902267, "kl": 0.004360925406217575, "learning_rate": 3.79e-07, "loss": 0.0003, "num_tokens": 2608896.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 164.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.004615867044776678, "kl": 0.00013434389984467998, "learning_rate": 3.786666666666667e-07, "loss": 0.0, "num_tokens": 2609168.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8865 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 164.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.3918683528900146, "kl": 0.051639024168252945, "learning_rate": 3.7833333333333335e-07, "loss": 0.058, "num_tokens": 2609473.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 8866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 164.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03771123290061951, "kl": 0.0054469656315632164, "learning_rate": 3.78e-07, "loss": 0.0003, "num_tokens": 2609800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 164.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04612196981906891, "kl": 0.00045797228813171387, "learning_rate": 3.7766666666666665e-07, "loss": 0.0, "num_tokens": 2610010.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 164.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.10120661556720734, "kl": 0.07140768505632877, "learning_rate": 3.773333333333333e-07, "loss": 0.0036, "num_tokens": 2610384.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 164.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.003295034635812044, "kl": 0.0016836468130350113, "learning_rate": 3.7700000000000005e-07, "loss": 0.0001, "num_tokens": 2610696.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03494046628475189, "kl": 0.0013030902482569218, "learning_rate": 3.766666666666667e-07, "loss": 0.0001, "num_tokens": 2610961.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 164.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10659421235322952, "kl": 0.017446937505155802, "learning_rate": 3.7633333333333335e-07, "loss": 0.0009, "num_tokens": 2611233.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 164.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.047392524778842926, "kl": 0.00733506865799427, "learning_rate": 3.76e-07, "loss": 0.0003, "num_tokens": 2611491.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 164.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.012479480355978012, "kl": 0.26650701463222504, "learning_rate": 3.756666666666667e-07, "loss": 0.0133, "num_tokens": 2611795.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8874 }, { "clip_ratio/high_max": 0.006756756920367479, "clip_ratio/high_mean": 0.006756756920367479, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006756756920367479, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 164.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.3536951541900635, "kl": 0.13813966512680054, "learning_rate": 3.753333333333334e-07, "loss": 0.0897, "num_tokens": 2612184.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 164.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.02731388621032238, "kl": 0.005873343674466014, "learning_rate": 3.75e-07, "loss": 0.0003, "num_tokens": 2612472.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 164.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006941261235624552, "kl": 0.16369856894016266, "learning_rate": 3.7466666666666663e-07, "loss": 0.0082, "num_tokens": 2612780.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 164.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.13367219269275665, "kl": 0.02340683527290821, "learning_rate": 3.7433333333333336e-07, "loss": 0.0012, "num_tokens": 2613071.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 164.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004712092340923846, "kl": 0.0012449536588974297, "learning_rate": 3.74e-07, "loss": 0.0001, "num_tokens": 2613351.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 164.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.09926892817020416, "kl": 0.004716712632216513, "learning_rate": 3.7366666666666666e-07, "loss": 0.0002, "num_tokens": 2613622.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 164.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.012036683969199657, "kl": 0.000649003341095522, "learning_rate": 3.7333333333333334e-07, "loss": 0.0, "num_tokens": 2613930.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 164.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 3.629360071499832e-05, "kl": 2.175569534301758e-06, "learning_rate": 3.73e-07, "loss": 0.0, "num_tokens": 2614150.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 164.5, "frac_reward_zero_std": 1.0, "grad_norm": 2.0771701335906982, "kl": 0.15470555424690247, "learning_rate": 3.726666666666667e-07, "loss": 0.0103, "num_tokens": 2614368.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 164.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.618802785873413, "kl": 0.7924874499440193, "learning_rate": 3.7233333333333337e-07, "loss": 0.01, "num_tokens": 2614668.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.31049013137817383, "kl": 0.04288405901752412, "learning_rate": 3.72e-07, "loss": 0.0025, "num_tokens": 2614964.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 164.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02175561711192131, "kl": 0.011518369428813457, "learning_rate": 3.7166666666666667e-07, "loss": 0.0006, "num_tokens": 2615224.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 164.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06942158192396164, "kl": 0.02773621492087841, "learning_rate": 3.7133333333333335e-07, "loss": 0.0014, "num_tokens": 2615566.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 164.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0695282593369484, "kl": 0.006201328476890922, "learning_rate": 3.71e-07, "loss": 0.0003, "num_tokens": 2615904.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07365050911903381, "kl": 0.01597495237365365, "learning_rate": 3.706666666666667e-07, "loss": 0.0008, "num_tokens": 2616190.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 164.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.003071509301662445, "kl": 0.00012495517148636281, "learning_rate": 3.7033333333333333e-07, "loss": 0.0, "num_tokens": 2616410.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 164.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.020218145102262497, "kl": 0.0008706642256584018, "learning_rate": 3.7e-07, "loss": 0.0, "num_tokens": 2616690.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 164.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.027547791600227356, "kl": 0.0019597470527514815, "learning_rate": 3.696666666666667e-07, "loss": 0.0001, "num_tokens": 2616950.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 164.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09631503373384476, "kl": 0.02029264811426401, "learning_rate": 3.6933333333333336e-07, "loss": 0.0011, "num_tokens": 2617238.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 164.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.198466777801514, "kl": 0.059917932376265526, "learning_rate": 3.69e-07, "loss": 0.1073, "num_tokens": 2617556.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 8894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 164.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.567140579223633, "kl": 0.014268872095271945, "learning_rate": 3.686666666666667e-07, "loss": 0.1812, "num_tokens": 2617925.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 164.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.017244728282094002, "kl": 0.005691462545655668, "learning_rate": 3.6833333333333334e-07, "loss": 0.0003, "num_tokens": 2618251.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.00633375346660614, "kl": 0.0007037085015326738, "learning_rate": 3.68e-07, "loss": 0.0, "num_tokens": 2618535.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 164.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.001179036684334278, "kl": 0.0004479549825191498, "learning_rate": 3.676666666666667e-07, "loss": 0.0, "num_tokens": 2618795.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07126681506633759, "kl": 0.005395851098001003, "learning_rate": 3.673333333333333e-07, "loss": 0.0003, "num_tokens": 2619093.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 164.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0403926819562912, "kl": 0.008564659859985113, "learning_rate": 3.67e-07, "loss": 0.0004, "num_tokens": 2619421.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 164.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.1954672932624817, "kl": 0.013664277270436287, "learning_rate": 3.6666666666666667e-07, "loss": 0.0008, "num_tokens": 2619642.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 164.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.7249271869659424, "kl": 0.10196920670568943, "learning_rate": 3.6633333333333334e-07, "loss": 0.0665, "num_tokens": 2619997.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 164.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.6210243701934814, "kl": 0.06056203693151474, "learning_rate": 3.66e-07, "loss": -0.1532, "num_tokens": 2620346.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 164.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.116154193878174, "kl": 0.12774553894996643, "learning_rate": 3.656666666666667e-07, "loss": 0.0808, "num_tokens": 2620727.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 8904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 164.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.012762450613081455, "kl": 0.00012174844596302137, "learning_rate": 3.653333333333333e-07, "loss": 0.0, "num_tokens": 2620983.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 164.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04183805361390114, "kl": 0.003286240331362933, "learning_rate": 3.6500000000000005e-07, "loss": 0.0002, "num_tokens": 2621249.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 164.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02714330516755581, "kl": 0.0014906234864611179, "learning_rate": 3.646666666666667e-07, "loss": 0.0001, "num_tokens": 2621576.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 164.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.025136498734354973, "kl": 0.001960758410859853, "learning_rate": 3.643333333333333e-07, "loss": 0.0001, "num_tokens": 2621872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 164.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03514412045478821, "kl": 0.0031722472049295902, "learning_rate": 3.6400000000000003e-07, "loss": 0.0002, "num_tokens": 2622167.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.282050848007202, "kl": 0.18228881061077118, "learning_rate": 3.6366666666666665e-07, "loss": 0.0256, "num_tokens": 2622456.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 165.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.175658226013184, "kl": 0.17125089094042778, "learning_rate": 3.6333333333333333e-07, "loss": -0.2149, "num_tokens": 2622766.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 8911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 165.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.13480964303016663, "kl": 0.014778361655771732, "learning_rate": 3.63e-07, "loss": 0.0007, "num_tokens": 2623033.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 165.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.00694236671552062, "kl": 0.16172915697097778, "learning_rate": 3.626666666666667e-07, "loss": 0.0081, "num_tokens": 2623342.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 13.388091087341309, "kl": 0.02993581583723426, "learning_rate": 3.6233333333333336e-07, "loss": 0.1351, "num_tokens": 2623625.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.2529824674129486, "kl": 0.028030208311975002, "learning_rate": 3.6200000000000004e-07, "loss": 0.0016, "num_tokens": 2623925.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 165.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06724633276462555, "kl": 0.009230048395693302, "learning_rate": 3.6166666666666666e-07, "loss": 0.0005, "num_tokens": 2624261.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 165.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.06674270331859589, "kl": 0.01059711305424571, "learning_rate": 3.6133333333333334e-07, "loss": 0.0005, "num_tokens": 2624553.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 165.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04066598042845726, "kl": 0.002640068531036377, "learning_rate": 3.61e-07, "loss": 0.0001, "num_tokens": 2624813.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 165.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.07767357677221298, "kl": 0.04681067913770676, "learning_rate": 3.6066666666666664e-07, "loss": 0.0023, "num_tokens": 2625217.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 165.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.008731626905500889, "kl": 0.0003216303884983063, "learning_rate": 3.6033333333333337e-07, "loss": 0.0, "num_tokens": 2625461.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 165.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.012828600592911243, "kl": 0.00016302168660331517, "learning_rate": 3.6e-07, "loss": 0.0, "num_tokens": 2625717.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 165.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.041724614799022675, "kl": 0.013417571317404509, "learning_rate": 3.5966666666666667e-07, "loss": 0.0007, "num_tokens": 2626057.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 165.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.024062277749180794, "kl": 0.0016484694206155837, "learning_rate": 3.5933333333333335e-07, "loss": 0.0001, "num_tokens": 2626329.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 165.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.013716497458517551, "kl": 0.26632802188396454, "learning_rate": 3.59e-07, "loss": 0.0133, "num_tokens": 2626633.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 165.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04783390089869499, "kl": 0.005601051088888198, "learning_rate": 3.5866666666666665e-07, "loss": 0.0002, "num_tokens": 2626947.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 165.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.47431907057762146, "kl": 0.05029179051052779, "learning_rate": 3.583333333333334e-07, "loss": 0.0027, "num_tokens": 2627229.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 165.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04834480956196785, "kl": 0.025549123995006084, "learning_rate": 3.58e-07, "loss": 0.0012, "num_tokens": 2627575.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 165.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 6.4278645515441895, "kl": 0.011338358279317617, "learning_rate": 3.576666666666667e-07, "loss": 0.072, "num_tokens": 2627868.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 165.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020694900304079056, "kl": 1.8887221813201904e-05, "learning_rate": 3.5733333333333336e-07, "loss": 0.0, "num_tokens": 2628080.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0922953188419342, "kl": 0.0055123771307989955, "learning_rate": 3.57e-07, "loss": 0.0003, "num_tokens": 2628353.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 165.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04353654384613037, "kl": 0.016483448445796967, "learning_rate": 3.5666666666666666e-07, "loss": 0.0008, "num_tokens": 2628658.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.09266700595617294, "kl": 0.039644776843488216, "learning_rate": 3.5633333333333333e-07, "loss": 0.002, "num_tokens": 2628950.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 165.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.023396484553813934, "kl": 0.0011999238631688058, "learning_rate": 3.56e-07, "loss": 0.0001, "num_tokens": 2629228.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.06482604891061783, "kl": 0.010835106950253248, "learning_rate": 3.556666666666667e-07, "loss": 0.0005, "num_tokens": 2629546.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 165.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.009873195551335812, "kl": 0.0727309063076973, "learning_rate": 3.5533333333333337e-07, "loss": 0.0037, "num_tokens": 2629916.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 165.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035874859895557165, "kl": 0.00014355778694152832, "learning_rate": 3.55e-07, "loss": 0.0, "num_tokens": 2630136.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 165.5, "frac_reward_zero_std": 1.0, "grad_norm": 7.541063678218052e-05, "kl": 2.339482307434082e-06, "learning_rate": 3.546666666666667e-07, "loss": 0.0, "num_tokens": 2630356.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 165.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06692533195018768, "kl": 0.0063443309627473354, "learning_rate": 3.5433333333333334e-07, "loss": 0.0003, "num_tokens": 2630654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 165.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033889184705913067, "kl": 0.0003566415543900803, "learning_rate": 3.5399999999999997e-07, "loss": 0.0, "num_tokens": 2630889.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 165.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.07119540870189667, "kl": 0.0018159648170694709, "learning_rate": 3.536666666666667e-07, "loss": 0.0001, "num_tokens": 2631099.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 80.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 80.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 165.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.9200172424316406, "kl": 0.018058304209262133, "learning_rate": 3.533333333333333e-07, "loss": 0.4624, "num_tokens": 2631641.0, "reward": 6.300000190734863, "reward_std": 2.4000000953674316, "rewards/reward_combined/mean": 6.300000190734863, "rewards/reward_combined/std": 2.3999998569488525, "step": 8941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 165.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.20624007284641266, "kl": 0.03857899643480778, "learning_rate": 3.53e-07, "loss": 0.0022, "num_tokens": 2632025.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 165.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.040238816291093826, "kl": 0.048973362892866135, "learning_rate": 3.526666666666667e-07, "loss": 0.0024, "num_tokens": 2632367.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 165.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005650758743286, "kl": 0.007193847734015435, "learning_rate": 3.5233333333333335e-07, "loss": 0.0004, "num_tokens": 2632654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05416218191385269, "kl": 0.0069457958452403545, "learning_rate": 3.5200000000000003e-07, "loss": 0.0003, "num_tokens": 2632946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 165.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.021506691351532936, "kl": 0.0026105040742550045, "learning_rate": 3.516666666666667e-07, "loss": 0.0001, "num_tokens": 2633216.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 12.81379222869873, "kl": 2.445858425926417, "learning_rate": 3.5133333333333333e-07, "loss": 0.164, "num_tokens": 2633498.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 8947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 165.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01908211223781109, "kl": 0.0008574156381655484, "learning_rate": 3.51e-07, "loss": 0.0, "num_tokens": 2633821.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 165.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05590381845831871, "kl": 0.013555833138525486, "learning_rate": 3.506666666666667e-07, "loss": 0.0008, "num_tokens": 2634095.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 165.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0392288975417614, "kl": 0.003294804133474827, "learning_rate": 3.503333333333333e-07, "loss": 0.0002, "num_tokens": 2634407.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 165.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.014749441295862198, "kl": 0.0035708002251340076, "learning_rate": 3.5000000000000004e-07, "loss": 0.0001, "num_tokens": 2634667.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 165.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.002090618945658207, "kl": 0.00046838074922561646, "learning_rate": 3.4966666666666666e-07, "loss": 0.0, "num_tokens": 2634927.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 165.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.001910749590024352, "kl": 0.003546901047229767, "learning_rate": 3.4933333333333334e-07, "loss": 0.0002, "num_tokens": 2635163.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 165.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01774788461625576, "kl": 0.008146382169798017, "learning_rate": 3.49e-07, "loss": 0.0004, "num_tokens": 2635489.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 165.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.028992164880037308, "kl": 0.001073649211321026, "learning_rate": 3.486666666666667e-07, "loss": 0.0001, "num_tokens": 2635797.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 165.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.017621587961912155, "kl": 0.000513613224029541, "learning_rate": 3.483333333333333e-07, "loss": 0.0, "num_tokens": 2636009.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 165.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.24761372804641724, "kl": 0.07010496780276299, "learning_rate": 3.4800000000000005e-07, "loss": 0.0037, "num_tokens": 2636374.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 165.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.10244481265544891, "kl": 0.012026742100715637, "learning_rate": 3.4766666666666667e-07, "loss": 0.0006, "num_tokens": 2636615.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 165.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.783552408218384, "kl": 0.659954097121954, "learning_rate": 3.4733333333333335e-07, "loss": 0.0648, "num_tokens": 2636876.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 165.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.7953356504440308, "kl": 0.10929779708385468, "learning_rate": 3.47e-07, "loss": 0.0544, "num_tokens": 2637257.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 165.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 1.8628766536712646, "kl": 0.1270136758685112, "learning_rate": 3.4666666666666665e-07, "loss": -0.1134, "num_tokens": 2637607.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 165.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.11938206106424332, "kl": 0.02930101566016674, "learning_rate": 3.463333333333333e-07, "loss": 0.0016, "num_tokens": 2637936.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 165.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03477209806442261, "kl": 0.003304382844362408, "learning_rate": 3.46e-07, "loss": 0.0002, "num_tokens": 2638226.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 166.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.009456836618483067, "kl": 0.00017060488607967272, "learning_rate": 3.456666666666667e-07, "loss": 0.0, "num_tokens": 2638496.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 166.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01614745892584324, "kl": 0.00039904813093016855, "learning_rate": 3.4533333333333336e-07, "loss": 0.0, "num_tokens": 2638766.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 166.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07821919023990631, "kl": 0.017789161298424006, "learning_rate": 3.4500000000000003e-07, "loss": 0.0009, "num_tokens": 2639052.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 166.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.351588487625122, "kl": 0.21713664382696152, "learning_rate": 3.4466666666666666e-07, "loss": 0.0701, "num_tokens": 2639391.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 8967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11390016227960587, "kl": 0.008551203645765781, "learning_rate": 3.443333333333334e-07, "loss": 0.0004, "num_tokens": 2639674.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 166.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04560841992497444, "kl": 0.0010352313402108848, "learning_rate": 3.44e-07, "loss": 0.0001, "num_tokens": 2639887.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 166.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.018355349078774452, "kl": 0.00022319257914205082, "learning_rate": 3.4366666666666663e-07, "loss": 0.0, "num_tokens": 2640143.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 166.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.08908860385417938, "kl": 0.01137493271380663, "learning_rate": 3.4333333333333336e-07, "loss": 0.0006, "num_tokens": 2640473.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 166.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01968403533101082, "kl": 0.0007319689611904323, "learning_rate": 3.43e-07, "loss": 0.0, "num_tokens": 2640793.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 166.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 7.014540672302246, "kl": 0.027785656973719597, "learning_rate": 3.4266666666666666e-07, "loss": 0.1351, "num_tokens": 2641060.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 8973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 166.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010650206357240677, "kl": 0.0007479600608348846, "learning_rate": 3.4233333333333334e-07, "loss": 0.0, "num_tokens": 2641346.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 166.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03259936720132828, "kl": 0.002014004574448336, "learning_rate": 3.42e-07, "loss": 0.0001, "num_tokens": 2641658.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 166.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.027884047478437424, "kl": 0.002687966451048851, "learning_rate": 3.4166666666666664e-07, "loss": 0.0002, "num_tokens": 2641885.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 166.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.08815071731805801, "kl": 0.002794707892462611, "learning_rate": 3.4133333333333337e-07, "loss": 0.0002, "num_tokens": 2642097.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 166.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.060912683606147766, "kl": 0.016964766662567854, "learning_rate": 3.41e-07, "loss": 0.0009, "num_tokens": 2642381.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 166.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06269055604934692, "kl": 0.008007180411368608, "learning_rate": 3.4066666666666667e-07, "loss": 0.0004, "num_tokens": 2642712.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 166.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0740547701716423, "kl": 0.0038766830693930387, "learning_rate": 3.4033333333333335e-07, "loss": 0.0002, "num_tokens": 2642988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 166.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.021885257214307785, "kl": 0.09568078815937042, "learning_rate": 3.4e-07, "loss": 0.0048, "num_tokens": 2643360.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 166.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.037093162536621094, "kl": 0.012666834518313408, "learning_rate": 3.396666666666667e-07, "loss": 0.0006, "num_tokens": 2643663.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0044247787445783615, "clip_ratio/low_min": 0.0044247787445783615, "clip_ratio/region_mean": 0.0044247787445783615, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 166.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.776369333267212, "kl": 0.045760709792375565, "learning_rate": 3.3933333333333333e-07, "loss": 0.3509, "num_tokens": 2644088.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 166.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 6.3478779792785645, "kl": 0.24215000867843628, "learning_rate": 3.39e-07, "loss": -0.0877, "num_tokens": 2644372.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 166.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.8391432762145996, "kl": 0.09275639988481998, "learning_rate": 3.386666666666667e-07, "loss": 0.0384, "num_tokens": 2644742.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 166.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.004364797845482826, "kl": 0.00026813894510269165, "learning_rate": 3.3833333333333336e-07, "loss": 0.0, "num_tokens": 2644986.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.013198398053646088, "kl": 0.0007692250364925712, "learning_rate": 3.38e-07, "loss": 0.0, "num_tokens": 2645254.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 166.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0594061017036438, "kl": 0.007339270319789648, "learning_rate": 3.376666666666667e-07, "loss": 0.0003, "num_tokens": 2645549.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 166.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.383419990539551, "kl": 0.0505806072615087, "learning_rate": 3.3733333333333334e-07, "loss": 0.0497, "num_tokens": 2645846.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 166.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.022364147007465363, "kl": 0.004210047423839569, "learning_rate": 3.37e-07, "loss": 0.0002, "num_tokens": 2646136.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 166.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.026250915601849556, "kl": 0.005221094004809856, "learning_rate": 3.366666666666667e-07, "loss": 0.0003, "num_tokens": 2646424.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 166.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 9.70198917388916, "kl": 0.020322605734691024, "learning_rate": 3.363333333333333e-07, "loss": 0.1312, "num_tokens": 2646691.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 166.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0409669429063797, "kl": 0.060269128531217575, "learning_rate": 3.36e-07, "loss": 0.003, "num_tokens": 2647063.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 166.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.7887897491455078, "kl": 0.11246787011623383, "learning_rate": 3.3566666666666667e-07, "loss": 0.0064, "num_tokens": 2647407.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 166.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020631065126508474, "kl": 0.0035220980644226074, "learning_rate": 3.3533333333333334e-07, "loss": 0.0002, "num_tokens": 2647643.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 166.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0824362188577652, "kl": 0.032608781941235065, "learning_rate": 3.35e-07, "loss": 0.0018, "num_tokens": 2648031.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 166.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.15901196002960205, "kl": 0.011640347132924944, "learning_rate": 3.346666666666667e-07, "loss": 0.0007, "num_tokens": 2648306.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 166.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.09696836769580841, "kl": 0.018158008344471455, "learning_rate": 3.343333333333333e-07, "loss": 0.0009, "num_tokens": 2648629.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 166.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04881034046411514, "kl": 0.0047300157602876425, "learning_rate": 3.3400000000000005e-07, "loss": 0.0002, "num_tokens": 2648943.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 166.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 4.4322829246521, "kl": 0.0021250458667054772, "learning_rate": 3.336666666666667e-07, "loss": -0.0354, "num_tokens": 2649225.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 9000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.047932110726833344, "kl": 0.0020593113731592894, "learning_rate": 3.333333333333333e-07, "loss": 0.0001, "num_tokens": 2649525.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 166.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.012752959504723549, "kl": 0.26652343571186066, "learning_rate": 3.3300000000000003e-07, "loss": 0.0133, "num_tokens": 2649829.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 166.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011644826008705422, "kl": 2.6226043701171875e-06, "learning_rate": 3.3266666666666665e-07, "loss": 0.0, "num_tokens": 2650049.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 166.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03149415925145149, "kl": 0.16143415868282318, "learning_rate": 3.3233333333333333e-07, "loss": 0.0081, "num_tokens": 2650359.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 166.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.005765592213720083, "kl": 0.00034224688715767115, "learning_rate": 3.32e-07, "loss": 0.0, "num_tokens": 2650619.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 166.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.023634400218725204, "kl": 0.04280726984143257, "learning_rate": 3.316666666666667e-07, "loss": 0.0021, "num_tokens": 2651023.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 166.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05035879462957382, "kl": 0.0018155035795643926, "learning_rate": 3.313333333333333e-07, "loss": 0.0001, "num_tokens": 2651256.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.007576784119009972, "kl": 0.0014271500403992832, "learning_rate": 3.3100000000000004e-07, "loss": 0.0001, "num_tokens": 2651533.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9008 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 166.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 4.5049614906311035, "kl": 0.24596751108765602, "learning_rate": 3.3066666666666666e-07, "loss": 0.0226, "num_tokens": 2651832.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 166.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.396388292312622, "kl": 0.02755722193978727, "learning_rate": 3.3033333333333334e-07, "loss": -0.0629, "num_tokens": 2652132.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 166.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0171686839312315, "kl": 0.0127438441850245, "learning_rate": 3.3e-07, "loss": 0.0006, "num_tokens": 2652392.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 166.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.12279807031154633, "kl": 0.017405035556294024, "learning_rate": 3.2966666666666664e-07, "loss": 0.0009, "num_tokens": 2652686.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 166.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047224522568285465, "kl": 0.00045462697744369507, "learning_rate": 3.2933333333333337e-07, "loss": 0.0, "num_tokens": 2652946.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 166.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.003492511110380292, "kl": 0.00013096928159939125, "learning_rate": 3.29e-07, "loss": 0.0, "num_tokens": 2653166.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 166.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.026813959702849388, "kl": 0.000827362178824842, "learning_rate": 3.2866666666666667e-07, "loss": 0.0, "num_tokens": 2653382.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 166.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1722688227891922, "kl": 0.020217320881783962, "learning_rate": 3.2833333333333335e-07, "loss": 0.001, "num_tokens": 2653655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.013961921446025372, "kl": 0.01194792427122593, "learning_rate": 3.28e-07, "loss": 0.0007, "num_tokens": 2653929.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 167.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.23052072525024414, "kl": 0.024137687403708696, "learning_rate": 3.2766666666666665e-07, "loss": 0.0011, "num_tokens": 2654268.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 167.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02728920988738537, "kl": 0.0011898605152964592, "learning_rate": 3.273333333333334e-07, "loss": 0.0001, "num_tokens": 2654560.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 167.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.350365161895752, "kl": 0.12469993159174919, "learning_rate": 3.27e-07, "loss": -0.1039, "num_tokens": 2654927.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 167.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.044807445257902145, "kl": 0.032632600516080856, "learning_rate": 3.266666666666667e-07, "loss": 0.0016, "num_tokens": 2655231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 167.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.013114286586642265, "kl": 0.2664356380701065, "learning_rate": 3.2633333333333336e-07, "loss": 0.0133, "num_tokens": 2655535.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 167.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.031016511842608452, "kl": 0.0012240736396051943, "learning_rate": 3.26e-07, "loss": 0.0001, "num_tokens": 2655769.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 167.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.006127322558313608, "kl": 0.0004086077242391184, "learning_rate": 3.2566666666666666e-07, "loss": 0.0, "num_tokens": 2656029.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 167.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.217532634735107, "kl": 0.07824867963790894, "learning_rate": 3.2533333333333333e-07, "loss": 0.0085, "num_tokens": 2656352.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 9025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 167.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03403575345873833, "kl": 0.002095251576974988, "learning_rate": 3.25e-07, "loss": 0.0001, "num_tokens": 2656650.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9026 }, { "clip_ratio/high_max": 0.0017667844658717513, "clip_ratio/high_mean": 0.0017667844658717513, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017667844658717513, "completion_length": 88.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 88.5, "completions/mean_terminated_length": 32.66666793823242, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 167.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.064964532852173, "kl": 0.18354413658380508, "learning_rate": 3.246666666666667e-07, "loss": 0.3913, "num_tokens": 2657220.0, "reward": 2.875, "reward_std": 4.75, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 4.75, "step": 9027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 167.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.13474050164222717, "kl": 0.007781546883052215, "learning_rate": 3.2433333333333337e-07, "loss": 0.0004, "num_tokens": 2657490.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 167.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0582008995115757, "kl": 0.009410197380930185, "learning_rate": 3.24e-07, "loss": 0.0004, "num_tokens": 2657782.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 167.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06137774884700775, "kl": 0.006446503335610032, "learning_rate": 3.236666666666667e-07, "loss": 0.0003, "num_tokens": 2658055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 167.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.009382962249219418, "kl": 0.0012312799808569252, "learning_rate": 3.2333333333333334e-07, "loss": 0.0001, "num_tokens": 2658315.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 167.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.045821577310562134, "kl": 0.006673014722764492, "learning_rate": 3.2299999999999997e-07, "loss": 0.0004, "num_tokens": 2658637.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 167.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.3322956562042236, "kl": 0.007913945824839175, "learning_rate": 3.226666666666667e-07, "loss": -0.0016, "num_tokens": 2658969.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 167.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.01629730314016342, "kl": 0.004390859045088291, "learning_rate": 3.223333333333333e-07, "loss": 0.0002, "num_tokens": 2659251.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 167.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.9045982360839844, "kl": 0.01428734790533781, "learning_rate": 3.22e-07, "loss": -0.0053, "num_tokens": 2659550.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 9035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 167.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02386181429028511, "kl": 0.04284735023975372, "learning_rate": 3.216666666666667e-07, "loss": 0.0021, "num_tokens": 2659954.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 167.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05300671234726906, "kl": 0.025482898578047752, "learning_rate": 3.2133333333333335e-07, "loss": 0.0013, "num_tokens": 2660337.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 167.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.5597089529037476, "kl": 0.06891607865691185, "learning_rate": 3.21e-07, "loss": 0.0037, "num_tokens": 2660670.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 167.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04626183584332466, "kl": 0.0034574606688693166, "learning_rate": 3.206666666666667e-07, "loss": 0.0002, "num_tokens": 2660935.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 167.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.005932462867349386, "kl": 0.16389703750610352, "learning_rate": 3.2033333333333333e-07, "loss": 0.0082, "num_tokens": 2661243.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 167.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.014954416081309319, "kl": 0.000551818564417772, "learning_rate": 3.2e-07, "loss": 0.0, "num_tokens": 2661562.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 167.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02260546386241913, "kl": 0.09551291167736053, "learning_rate": 3.196666666666667e-07, "loss": 0.0048, "num_tokens": 2661934.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 167.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09751478582620621, "kl": 0.0205126847140491, "learning_rate": 3.193333333333333e-07, "loss": 0.0012, "num_tokens": 2662220.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 167.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.013813708908855915, "kl": 0.0005569718778133392, "learning_rate": 3.1900000000000004e-07, "loss": 0.0, "num_tokens": 2662480.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 167.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.001637967536225915, "kl": 9.602904174244031e-05, "learning_rate": 3.1866666666666666e-07, "loss": 0.0, "num_tokens": 2662736.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 167.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.13162198662757874, "kl": 0.027825507801026106, "learning_rate": 3.1833333333333334e-07, "loss": 0.0015, "num_tokens": 2663058.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 167.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09343654662370682, "kl": 0.0032493870239704847, "learning_rate": 3.18e-07, "loss": 0.0002, "num_tokens": 2663288.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 167.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 7.311008453369141, "kl": 0.025621794629842043, "learning_rate": 3.176666666666667e-07, "loss": 0.0451, "num_tokens": 2663597.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 167.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.003543616272509098, "kl": 8.435795461991802e-05, "learning_rate": 3.173333333333333e-07, "loss": 0.0, "num_tokens": 2663869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 167.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.05114828422665596, "kl": 0.006182837300002575, "learning_rate": 3.1700000000000005e-07, "loss": 0.0003, "num_tokens": 2664137.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 167.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.4044647216796875, "kl": 0.09645114466547966, "learning_rate": 3.1666666666666667e-07, "loss": 0.1221, "num_tokens": 2664488.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 167.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.019824182614684105, "kl": 0.0007872451096773148, "learning_rate": 3.1633333333333335e-07, "loss": 0.0, "num_tokens": 2664800.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 167.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.024676613509655, "kl": 0.0011943488207180053, "learning_rate": 3.16e-07, "loss": 0.0001, "num_tokens": 2665066.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 167.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048749735578894615, "kl": 0.0005660813767462969, "learning_rate": 3.1566666666666665e-07, "loss": 0.0, "num_tokens": 2665350.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 167.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.017648622393608093, "kl": 0.0005122125148773193, "learning_rate": 3.153333333333333e-07, "loss": 0.0, "num_tokens": 2665562.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 167.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.30617356300354, "kl": 0.0075485792476683855, "learning_rate": 3.15e-07, "loss": 0.0947, "num_tokens": 2665908.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 167.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06386909633874893, "kl": 0.033163134939968586, "learning_rate": 3.146666666666667e-07, "loss": 0.0017, "num_tokens": 2666180.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 167.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004050778807140887, "kl": 0.0012495687697082758, "learning_rate": 3.1433333333333336e-07, "loss": 0.0001, "num_tokens": 2666460.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 167.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015987787628546357, "kl": 3.56137752532959e-06, "learning_rate": 3.1400000000000003e-07, "loss": 0.0, "num_tokens": 2666680.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 167.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05841468274593353, "kl": 0.004395393072627485, "learning_rate": 3.1366666666666666e-07, "loss": 0.0002, "num_tokens": 2666980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 167.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02497151494026184, "kl": 0.0011480699395178817, "learning_rate": 3.133333333333334e-07, "loss": 0.0001, "num_tokens": 2667289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 167.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.003614462213590741, "kl": 0.0001339554801234044, "learning_rate": 3.13e-07, "loss": 0.0, "num_tokens": 2667509.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 167.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.09565560519695282, "kl": 0.02800754737108946, "learning_rate": 3.1266666666666663e-07, "loss": 0.0015, "num_tokens": 2667797.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 167.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.6577396392822266, "kl": 0.01220496604219079, "learning_rate": 3.1233333333333336e-07, "loss": 0.0013, "num_tokens": 2668088.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 167.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.04903611168265343, "kl": 0.0012084171175956726, "learning_rate": 3.12e-07, "loss": 0.0001, "num_tokens": 2668298.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 167.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04346535727381706, "kl": 0.02796619851142168, "learning_rate": 3.1166666666666666e-07, "loss": 0.0012, "num_tokens": 2668646.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 167.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.08011380583047867, "kl": 0.00722011923789978, "learning_rate": 3.1133333333333334e-07, "loss": 0.0004, "num_tokens": 2668908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 167.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.09081485867500305, "kl": 0.00112876296043396, "learning_rate": 3.11e-07, "loss": 0.0001, "num_tokens": 2669120.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 167.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.001803085790015757, "kl": 0.003569498658180237, "learning_rate": 3.1066666666666664e-07, "loss": 0.0002, "num_tokens": 2669356.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 167.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.5184289216995239, "kl": 0.03959103426313959, "learning_rate": 3.1033333333333337e-07, "loss": 0.0024, "num_tokens": 2669671.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 167.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06740084290504456, "kl": 0.0050421059131622314, "learning_rate": 3.1e-07, "loss": 0.0003, "num_tokens": 2669915.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 168.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.017893768846988678, "kl": 0.01260658772662282, "learning_rate": 3.0966666666666667e-07, "loss": 0.0006, "num_tokens": 2670175.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 168.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.982280731201172, "kl": 0.02883831597864628, "learning_rate": 3.0933333333333335e-07, "loss": 0.1784, "num_tokens": 2670522.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 168.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.12652860581874847, "kl": 0.009109247475862503, "learning_rate": 3.09e-07, "loss": 0.0005, "num_tokens": 2670766.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 168.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03454748913645744, "kl": 0.0017968494212254882, "learning_rate": 3.086666666666667e-07, "loss": 0.0001, "num_tokens": 2671000.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 168.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.008845863863825798, "kl": 0.00047231465578079224, "learning_rate": 3.0833333333333333e-07, "loss": 0.0, "num_tokens": 2671212.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 168.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.13079652190208435, "kl": 0.07701052911579609, "learning_rate": 3.08e-07, "loss": 0.0041, "num_tokens": 2671627.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 168.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 2.709502696990967, "kl": 0.27142958249896765, "learning_rate": 3.076666666666667e-07, "loss": 0.0145, "num_tokens": 2671901.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 168.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.09337548911571503, "kl": 0.004717143252491951, "learning_rate": 3.0733333333333336e-07, "loss": 0.0003, "num_tokens": 2672128.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 168.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018021861324086785, "kl": 0.003566339612007141, "learning_rate": 3.07e-07, "loss": 0.0002, "num_tokens": 2672364.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 168.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.029145289212465286, "kl": 0.0039941276190802455, "learning_rate": 3.066666666666667e-07, "loss": 0.0002, "num_tokens": 2672632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 168.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.461592674255371, "kl": 0.16257892549037933, "learning_rate": 3.0633333333333334e-07, "loss": -0.042, "num_tokens": 2672995.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 168.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022829370573163033, "kl": 0.0014499109238386154, "learning_rate": 3.06e-07, "loss": 0.0001, "num_tokens": 2673307.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 168.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.021950727328658104, "kl": 0.0012737291399389505, "learning_rate": 3.056666666666667e-07, "loss": 0.0001, "num_tokens": 2673597.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 168.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.05887385457754135, "kl": 0.004343248903751373, "learning_rate": 3.053333333333333e-07, "loss": 0.0002, "num_tokens": 2673873.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 168.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 3.792623829212971e-05, "kl": 2.2426247596740723e-06, "learning_rate": 3.05e-07, "loss": 0.0, "num_tokens": 2674093.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 168.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06482207030057907, "kl": 0.0340889748185873, "learning_rate": 3.0466666666666667e-07, "loss": 0.0017, "num_tokens": 2674393.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 168.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1505323052406311, "kl": 0.018018494360148907, "learning_rate": 3.0433333333333335e-07, "loss": 0.0009, "num_tokens": 2674730.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 168.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06641481816768646, "kl": 0.018404459580779076, "learning_rate": 3.04e-07, "loss": 0.0009, "num_tokens": 2675002.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 168.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03631287440657616, "kl": 0.020896779373288155, "learning_rate": 3.036666666666667e-07, "loss": 0.0011, "num_tokens": 2675275.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 168.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.020260119810700417, "kl": 0.004041685722768307, "learning_rate": 3.033333333333333e-07, "loss": 0.0002, "num_tokens": 2675565.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 168.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.033487025648355484, "kl": 0.006246573058888316, "learning_rate": 3.0300000000000005e-07, "loss": 0.0003, "num_tokens": 2675908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 168.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01766207069158554, "kl": 0.002122357487678528, "learning_rate": 3.026666666666667e-07, "loss": 0.0001, "num_tokens": 2676125.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 168.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.06749314069747925, "kl": 0.05411577969789505, "learning_rate": 3.023333333333333e-07, "loss": 0.0027, "num_tokens": 2676494.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 168.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071084643714129925, "kl": 0.1596687063574791, "learning_rate": 3.0200000000000003e-07, "loss": 0.008, "num_tokens": 2676804.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 168.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.058292195200920105, "kl": 0.014927006792277098, "learning_rate": 3.0166666666666665e-07, "loss": 0.0008, "num_tokens": 2677090.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 168.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.012844592332839966, "kl": 0.07325214520096779, "learning_rate": 3.0133333333333333e-07, "loss": 0.0037, "num_tokens": 2677460.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 168.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04029526561498642, "kl": 0.03668023273348808, "learning_rate": 3.01e-07, "loss": 0.002, "num_tokens": 2677835.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 168.5, "frac_reward_zero_std": 0.0, "grad_norm": 7.292428970336914, "kl": 0.02206529534305446, "learning_rate": 3.006666666666667e-07, "loss": 0.112, "num_tokens": 2678127.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 168.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.2068705558776855, "kl": 0.08380471915006638, "learning_rate": 3.003333333333333e-07, "loss": 0.0748, "num_tokens": 2678465.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 168.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.028131909668445587, "kl": 0.005242582177743316, "learning_rate": 3.0000000000000004e-07, "loss": 0.0003, "num_tokens": 2678733.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 168.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 5.5013275146484375, "kl": 0.12188638374209404, "learning_rate": 2.9966666666666666e-07, "loss": 0.2227, "num_tokens": 2678982.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 168.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.035127200186252594, "kl": 0.005436467472463846, "learning_rate": 2.9933333333333334e-07, "loss": 0.0002, "num_tokens": 2679302.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 168.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.017706643790006638, "kl": 0.01266731508076191, "learning_rate": 2.99e-07, "loss": 0.0006, "num_tokens": 2679562.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 168.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.262886047363281, "kl": 0.06739437009673566, "learning_rate": 2.9866666666666664e-07, "loss": 0.008, "num_tokens": 2679838.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 168.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 1.27232027053833, "kl": 0.20473309140652418, "learning_rate": 2.9833333333333337e-07, "loss": 0.0108, "num_tokens": 2680166.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 168.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.20640438795089722, "kl": 0.05750578595325351, "learning_rate": 2.98e-07, "loss": 0.0018, "num_tokens": 2680529.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 168.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.056533508002758026, "kl": 0.03394610807299614, "learning_rate": 2.9766666666666667e-07, "loss": 0.0017, "num_tokens": 2680826.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 168.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03272896260023117, "kl": 0.0019635865464806557, "learning_rate": 2.9733333333333335e-07, "loss": 0.0001, "num_tokens": 2681140.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 168.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1420769840478897, "kl": 0.02583632292225957, "learning_rate": 2.97e-07, "loss": 0.0014, "num_tokens": 2681447.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 168.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.4552088975906372, "kl": 0.0613188095157966, "learning_rate": 2.9666666666666665e-07, "loss": 0.0031, "num_tokens": 2681756.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 168.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.7372629642486572, "kl": 0.11151259989128448, "learning_rate": 2.963333333333334e-07, "loss": 0.0058, "num_tokens": 2682017.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 168.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.004043793771415949, "kl": 0.0003586001694202423, "learning_rate": 2.96e-07, "loss": 0.0, "num_tokens": 2682277.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 168.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.1454576551914215, "kl": 0.019353345967829227, "learning_rate": 2.956666666666667e-07, "loss": 0.001, "num_tokens": 2682555.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 168.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0465005598962307, "kl": 0.006566162686794996, "learning_rate": 2.9533333333333336e-07, "loss": 0.0003, "num_tokens": 2682854.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 168.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.43169453740119934, "kl": 0.06690465216524899, "learning_rate": 2.95e-07, "loss": 0.0034, "num_tokens": 2683152.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 168.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.11691901832818985, "kl": 0.01078946515917778, "learning_rate": 2.9466666666666666e-07, "loss": 0.0006, "num_tokens": 2683487.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 168.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.015972651541233063, "kl": 0.0004284679889678955, "learning_rate": 2.9433333333333334e-07, "loss": 0.0, "num_tokens": 2683699.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 168.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.007666092831641436, "kl": 0.00038790104736108333, "learning_rate": 2.94e-07, "loss": 0.0, "num_tokens": 2684018.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 168.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.013481322675943375, "kl": 0.2663469910621643, "learning_rate": 2.936666666666667e-07, "loss": 0.0133, "num_tokens": 2684322.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 168.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.05399410054087639, "kl": 0.0011048614978790283, "learning_rate": 2.9333333333333337e-07, "loss": 0.0001, "num_tokens": 2684530.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 168.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.043435242027044296, "kl": 0.0033705367241054773, "learning_rate": 2.93e-07, "loss": 0.0002, "num_tokens": 2684804.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 168.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007465688977390528, "kl": 0.00134140788577497, "learning_rate": 2.926666666666667e-07, "loss": 0.0001, "num_tokens": 2685081.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 168.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09124170243740082, "kl": 0.012778798583894968, "learning_rate": 2.9233333333333334e-07, "loss": 0.0006, "num_tokens": 2685373.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 168.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0337088517844677, "kl": 0.05377374589443207, "learning_rate": 2.9199999999999997e-07, "loss": 0.0027, "num_tokens": 2685719.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 169.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035882391966879368, "kl": 6.958246376598254e-05, "learning_rate": 2.916666666666667e-07, "loss": 0.0, "num_tokens": 2685975.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 169.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.018008166924118996, "kl": 0.0048581333830952644, "learning_rate": 2.913333333333333e-07, "loss": 0.0002, "num_tokens": 2686264.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 169.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02369351126253605, "kl": 0.04278396815061569, "learning_rate": 2.91e-07, "loss": 0.0021, "num_tokens": 2686668.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 169.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.9191031455993652, "kl": 0.3539789766073227, "learning_rate": 2.906666666666667e-07, "loss": 0.0359, "num_tokens": 2686966.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 9129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015591923147439957, "kl": 3.56137752532959e-06, "learning_rate": 2.9033333333333335e-07, "loss": 0.0, "num_tokens": 2687186.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 169.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 1.5556657314300537, "kl": 0.06467830576002598, "learning_rate": 2.9e-07, "loss": 0.0041, "num_tokens": 2687541.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.021280251443386078, "kl": 0.000614169239270268, "learning_rate": 2.896666666666667e-07, "loss": 0.0, "num_tokens": 2687760.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 169.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.05633282661438, "kl": 0.06055983155965805, "learning_rate": 2.8933333333333333e-07, "loss": -0.0029, "num_tokens": 2688141.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.17023494839668274, "kl": 0.010327961761504412, "learning_rate": 2.89e-07, "loss": 0.0007, "num_tokens": 2688374.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 169.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.022016847506165504, "kl": 0.09566148370504379, "learning_rate": 2.886666666666667e-07, "loss": 0.0048, "num_tokens": 2688746.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 169.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03291894122958183, "kl": 0.0050873481668531895, "learning_rate": 2.883333333333333e-07, "loss": 0.0002, "num_tokens": 2689073.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 169.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.182126045227051, "kl": 0.06510461936704814, "learning_rate": 2.8800000000000004e-07, "loss": 0.1526, "num_tokens": 2689365.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 169.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.012140309438109398, "kl": 0.0004731234657811001, "learning_rate": 2.8766666666666666e-07, "loss": 0.0, "num_tokens": 2689677.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 169.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.006200368050485849, "kl": 0.16181518882513046, "learning_rate": 2.8733333333333334e-07, "loss": 0.0081, "num_tokens": 2689986.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 169.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.059274476021528244, "kl": 0.014872962608933449, "learning_rate": 2.87e-07, "loss": 0.0008, "num_tokens": 2690338.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 169.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.4888216257095337, "kl": 0.03381285443902016, "learning_rate": 2.866666666666667e-07, "loss": 0.0022, "num_tokens": 2690605.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 169.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.725061893463135, "kl": 0.14928995189256966, "learning_rate": 2.863333333333333e-07, "loss": 0.027, "num_tokens": 2690882.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 9142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 169.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.007302282378077507, "kl": 0.0011429578298702836, "learning_rate": 2.8600000000000005e-07, "loss": 0.0001, "num_tokens": 2691142.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 169.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.054937105625867844, "kl": 0.012293716194108129, "learning_rate": 2.8566666666666667e-07, "loss": 0.0006, "num_tokens": 2691426.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 169.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.007608338259160519, "kl": 0.0014296133304014802, "learning_rate": 2.8533333333333335e-07, "loss": 0.0001, "num_tokens": 2691703.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 169.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.036619335412979126, "kl": 0.00887847039848566, "learning_rate": 2.85e-07, "loss": 0.0005, "num_tokens": 2692046.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 169.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.048186130821704865, "kl": 0.003134794533252716, "learning_rate": 2.8466666666666665e-07, "loss": 0.0002, "num_tokens": 2692290.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.001961027504876256, "kl": 0.0035253167152404785, "learning_rate": 2.843333333333333e-07, "loss": 0.0002, "num_tokens": 2692526.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 169.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.026200201362371445, "kl": 0.005162051471415907, "learning_rate": 2.84e-07, "loss": 0.0003, "num_tokens": 2692814.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 169.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.085477314889431, "kl": 0.042702607810497284, "learning_rate": 2.836666666666667e-07, "loss": 0.0021, "num_tokens": 2693114.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 169.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06544790416955948, "kl": 0.017514828126877546, "learning_rate": 2.8333333333333336e-07, "loss": 0.0009, "num_tokens": 2693404.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 169.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.026688124984502792, "kl": 0.01073854649439454, "learning_rate": 2.8300000000000003e-07, "loss": 0.0005, "num_tokens": 2693665.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 169.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.325022220611572, "kl": 0.027866336633451283, "learning_rate": 2.8266666666666666e-07, "loss": 0.0377, "num_tokens": 2693934.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 169.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09739240258932114, "kl": 0.05873563513159752, "learning_rate": 2.8233333333333333e-07, "loss": 0.0029, "num_tokens": 2694311.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 169.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.3076964020729065, "kl": 0.04121387377381325, "learning_rate": 2.82e-07, "loss": 0.0023, "num_tokens": 2694618.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 169.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.026420464739203453, "kl": 0.0006961002945899963, "learning_rate": 2.8166666666666663e-07, "loss": 0.0, "num_tokens": 2694824.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 169.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.08795703947544098, "kl": 0.032087234780192375, "learning_rate": 2.8133333333333336e-07, "loss": 0.0016, "num_tokens": 2695165.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 169.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.055294718593358994, "kl": 0.023009028751403093, "learning_rate": 2.81e-07, "loss": 0.0011, "num_tokens": 2695450.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 169.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.014163954183459282, "kl": 0.26626069843769073, "learning_rate": 2.8066666666666667e-07, "loss": 0.0133, "num_tokens": 2695754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 169.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.011316646821796894, "kl": 0.0006442245794460177, "learning_rate": 2.8033333333333334e-07, "loss": 0.0, "num_tokens": 2696022.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 169.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.68283748626709, "kl": 0.023975116200745106, "learning_rate": 2.8e-07, "loss": 0.0245, "num_tokens": 2696354.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 169.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.04598274081945419, "kl": 0.0012741684913635254, "learning_rate": 2.7966666666666664e-07, "loss": 0.0001, "num_tokens": 2696570.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 169.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08580449223518372, "kl": 0.015314336866140366, "learning_rate": 2.7933333333333337e-07, "loss": 0.0008, "num_tokens": 2696862.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 169.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.7087838053703308, "kl": 0.06382860301528126, "learning_rate": 2.79e-07, "loss": 0.0035, "num_tokens": 2697149.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 169.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.024151522666215897, "kl": 0.012360613327473402, "learning_rate": 2.786666666666667e-07, "loss": 0.0007, "num_tokens": 2697421.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 169.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.11619424819946289, "kl": 0.02923139650374651, "learning_rate": 2.7833333333333335e-07, "loss": 0.0015, "num_tokens": 2697800.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 53.75, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 169.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 2.5626919269561768, "kl": 0.07971706241369247, "learning_rate": 2.78e-07, "loss": 0.3251, "num_tokens": 2698251.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 169.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.053862739354372025, "kl": 0.003571488428860903, "learning_rate": 2.776666666666667e-07, "loss": 0.0002, "num_tokens": 2698522.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 169.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021490822546184063, "kl": 0.0003683630784507841, "learning_rate": 2.7733333333333333e-07, "loss": 0.0, "num_tokens": 2698758.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 169.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08697015792131424, "kl": 0.032572224736213684, "learning_rate": 2.77e-07, "loss": 0.0016, "num_tokens": 2699068.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 169.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02715768851339817, "kl": 0.0014278620365075767, "learning_rate": 2.766666666666667e-07, "loss": 0.0001, "num_tokens": 2699390.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 169.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.030639655888080597, "kl": 0.0005421936511993408, "learning_rate": 2.7633333333333336e-07, "loss": 0.0, "num_tokens": 2699646.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 169.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.08038154244422913, "kl": 0.005284496815875173, "learning_rate": 2.76e-07, "loss": 0.0002, "num_tokens": 2699918.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 169.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0639193058013916, "kl": 0.005531937116757035, "learning_rate": 2.756666666666667e-07, "loss": 0.0003, "num_tokens": 2700216.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04740872606635094, "kl": 0.0005561858415603638, "learning_rate": 2.7533333333333334e-07, "loss": 0.0, "num_tokens": 2700430.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 169.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05378052592277527, "kl": 0.0035414602607488632, "learning_rate": 2.7499999999999996e-07, "loss": 0.0002, "num_tokens": 2700742.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 169.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.07247386872768402, "kl": 0.015047638677060604, "learning_rate": 2.746666666666667e-07, "loss": 0.0007, "num_tokens": 2701066.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 169.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.031231416389346123, "kl": 0.0068037977907806635, "learning_rate": 2.743333333333333e-07, "loss": 0.0003, "num_tokens": 2701322.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 169.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.002868650946766138, "kl": 7.636298687430099e-05, "learning_rate": 2.74e-07, "loss": 0.0, "num_tokens": 2701592.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 170.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.044867340475320816, "kl": 0.00415095454081893, "learning_rate": 2.7366666666666667e-07, "loss": 0.0002, "num_tokens": 2701892.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 170.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0876043364405632, "kl": 0.02977957483381033, "learning_rate": 2.7333333333333335e-07, "loss": 0.0015, "num_tokens": 2702233.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02200915478169918, "kl": 0.007784502813592553, "learning_rate": 2.73e-07, "loss": 0.0004, "num_tokens": 2702507.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 170.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.014696286991238594, "kl": 0.00040178000926971436, "learning_rate": 2.726666666666667e-07, "loss": 0.0, "num_tokens": 2702719.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046244912664406, "kl": 0.0012522083707153797, "learning_rate": 2.723333333333333e-07, "loss": 0.0001, "num_tokens": 2702999.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04747818782925606, "kl": 0.007726686540991068, "learning_rate": 2.72e-07, "loss": 0.0004, "num_tokens": 2703292.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 170.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0423741415143013, "kl": 0.001446351408958435, "learning_rate": 2.716666666666667e-07, "loss": 0.0001, "num_tokens": 2703552.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 170.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 7.266155989782419e-06, "kl": 1.8998980522155762e-06, "learning_rate": 2.713333333333333e-07, "loss": 0.0, "num_tokens": 2703772.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 170.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.004043549299240112, "kl": 0.00039796531200408936, "learning_rate": 2.7100000000000003e-07, "loss": 0.0, "num_tokens": 2703978.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.0425496101379395, "kl": 0.06336348480544984, "learning_rate": 2.7066666666666666e-07, "loss": 0.0193, "num_tokens": 2704247.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.016158513724803925, "kl": 0.0005108352343086153, "learning_rate": 2.7033333333333333e-07, "loss": 0.0, "num_tokens": 2704527.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 170.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05484510585665703, "kl": 0.060698799788951874, "learning_rate": 2.7e-07, "loss": 0.003, "num_tokens": 2704898.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 170.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.030827559530735016, "kl": 0.0036003990098834038, "learning_rate": 2.696666666666667e-07, "loss": 0.0002, "num_tokens": 2705200.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.017668500542640686, "kl": 0.0007510337454732507, "learning_rate": 2.693333333333333e-07, "loss": 0.0, "num_tokens": 2705526.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 170.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 4.536442756652832, "kl": 0.04639521427452564, "learning_rate": 2.6900000000000004e-07, "loss": 0.0518, "num_tokens": 2705840.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 170.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.018822981044650078, "kl": 0.0020066049182787538, "learning_rate": 2.6866666666666666e-07, "loss": 0.0001, "num_tokens": 2706110.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 170.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004678748548030853, "kl": 0.00013982994278194383, "learning_rate": 2.6833333333333334e-07, "loss": 0.0, "num_tokens": 2706332.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 170.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.16967931389808655, "kl": 0.021710258326493204, "learning_rate": 2.68e-07, "loss": 0.0016, "num_tokens": 2706609.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 170.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.018137242645025253, "kl": 0.0023055775091052055, "learning_rate": 2.6766666666666664e-07, "loss": 0.0001, "num_tokens": 2706921.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 170.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051785982213914394, "kl": 0.0002482414129190147, "learning_rate": 2.6733333333333337e-07, "loss": 0.0, "num_tokens": 2707181.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 4.575043678283691, "kl": 0.2401425465941429, "learning_rate": 2.67e-07, "loss": -0.0599, "num_tokens": 2707480.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 9200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 170.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.10838541388511658, "kl": 0.01027031010016799, "learning_rate": 2.6666666666666667e-07, "loss": 0.0005, "num_tokens": 2707750.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04622650146484375, "kl": 0.0029889001743867993, "learning_rate": 2.6633333333333335e-07, "loss": 0.0001, "num_tokens": 2708034.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 170.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.80014967918396, "kl": 0.03519435413181782, "learning_rate": 2.66e-07, "loss": 0.0717, "num_tokens": 2708363.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.040308836847543716, "kl": 0.004975801100954413, "learning_rate": 2.6566666666666665e-07, "loss": 0.0002, "num_tokens": 2708647.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 170.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.053678594529628754, "kl": 0.0022880625911056995, "learning_rate": 2.653333333333334e-07, "loss": 0.0001, "num_tokens": 2708881.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 170.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.2773194313049316, "kl": 0.03563913959078491, "learning_rate": 2.65e-07, "loss": 0.0971, "num_tokens": 2709187.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 170.5, "frac_reward_zero_std": 1.0, "grad_norm": 2.3573484420776367, "kl": 0.6315638422966003, "learning_rate": 2.6466666666666663e-07, "loss": 0.0316, "num_tokens": 2709491.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 170.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09753599762916565, "kl": 0.019341569393873215, "learning_rate": 2.6433333333333336e-07, "loss": 0.001, "num_tokens": 2709827.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.41413000226020813, "kl": 0.030696485773660243, "learning_rate": 2.64e-07, "loss": 0.0016, "num_tokens": 2710113.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 170.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.010697687044739723, "kl": 0.0006071812531445175, "learning_rate": 2.6366666666666666e-07, "loss": 0.0, "num_tokens": 2710424.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 170.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09459370374679565, "kl": 0.007220255443826318, "learning_rate": 2.6333333333333334e-07, "loss": 0.0004, "num_tokens": 2710761.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 170.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 4.147076606750488, "kl": 0.02942005218937993, "learning_rate": 2.63e-07, "loss": -0.093, "num_tokens": 2711095.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 170.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0780385211110115, "kl": 0.022949498146772385, "learning_rate": 2.626666666666667e-07, "loss": 0.0012, "num_tokens": 2711396.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.06539977341890335, "kl": 0.007835661293938756, "learning_rate": 2.6233333333333337e-07, "loss": 0.0004, "num_tokens": 2711670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 170.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.042042363435029984, "kl": 0.0020368692348711193, "learning_rate": 2.62e-07, "loss": 0.0001, "num_tokens": 2711942.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 170.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.028872370719909668, "kl": 0.0005261659680400044, "learning_rate": 2.6166666666666667e-07, "loss": 0.0, "num_tokens": 2712155.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 170.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.41078078746795654, "kl": 0.06827171891927719, "learning_rate": 2.6133333333333334e-07, "loss": 0.0033, "num_tokens": 2712517.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 170.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018712634919211268, "kl": 0.0035548508167266846, "learning_rate": 2.6099999999999997e-07, "loss": 0.0002, "num_tokens": 2712753.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 170.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.004391403403133154, "kl": 0.0014798715710639954, "learning_rate": 2.606666666666667e-07, "loss": 0.0001, "num_tokens": 2712969.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 170.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.026176869869232178, "kl": 0.05682331882417202, "learning_rate": 2.603333333333333e-07, "loss": 0.0028, "num_tokens": 2713301.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 170.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.027988692745566368, "kl": 0.09581217914819717, "learning_rate": 2.6e-07, "loss": 0.0048, "num_tokens": 2713673.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 170.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.2307140827178955, "kl": 0.03173012437764555, "learning_rate": 2.596666666666667e-07, "loss": 0.1443, "num_tokens": 2714016.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 170.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.7073192596435547, "kl": 0.10055483132600784, "learning_rate": 2.5933333333333335e-07, "loss": -0.0142, "num_tokens": 2714379.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 170.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0860510841012001, "kl": 0.005248288391157985, "learning_rate": 2.59e-07, "loss": 0.0003, "num_tokens": 2714686.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.032722461968660355, "kl": 0.0030742096714675426, "learning_rate": 2.586666666666667e-07, "loss": 0.0002, "num_tokens": 2714974.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9225 }, { "clip_ratio/high_max": 0.01923076994717121, "clip_ratio/high_mean": 0.01923076994717121, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01923076994717121, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 170.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.562826633453369, "kl": 0.07377597441154649, "learning_rate": 2.5833333333333333e-07, "loss": 0.0402, "num_tokens": 2715248.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 170.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.004207914229482412, "kl": 0.00027719512581825256, "learning_rate": 2.58e-07, "loss": 0.0, "num_tokens": 2715492.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 170.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01793079450726509, "kl": 0.012614931911230087, "learning_rate": 2.576666666666667e-07, "loss": 0.0006, "num_tokens": 2715752.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 170.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03436558321118355, "kl": 0.01032313471660018, "learning_rate": 2.573333333333333e-07, "loss": 0.0005, "num_tokens": 2716045.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 170.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.001848552841693163, "kl": 5.713402970286552e-05, "learning_rate": 2.5700000000000004e-07, "loss": 0.0, "num_tokens": 2716301.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 170.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.008658486418426037, "kl": 0.15746158361434937, "learning_rate": 2.5666666666666666e-07, "loss": 0.0079, "num_tokens": 2716612.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 170.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.24864767491817474, "kl": 0.02767011895775795, "learning_rate": 2.5633333333333334e-07, "loss": 0.0014, "num_tokens": 2716873.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 170.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.20802640914917, "kl": 0.0335781816393137, "learning_rate": 2.56e-07, "loss": 0.045, "num_tokens": 2717284.0, "reward": 2.25, "reward_std": 1.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.5, "step": 9233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06469318270683289, "kl": 0.005688388191629201, "learning_rate": 2.556666666666667e-07, "loss": 0.0003, "num_tokens": 2717582.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 171.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.017351731657981873, "kl": 0.002413789741694927, "learning_rate": 2.553333333333333e-07, "loss": 0.0001, "num_tokens": 2717894.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 171.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.013852477073669434, "kl": 0.26613885164260864, "learning_rate": 2.5500000000000005e-07, "loss": 0.0133, "num_tokens": 2718198.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 171.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.06120152398943901, "kl": 0.015362829202786088, "learning_rate": 2.5466666666666667e-07, "loss": 0.0008, "num_tokens": 2718484.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03666425496339798, "kl": 0.005595547321718186, "learning_rate": 2.543333333333333e-07, "loss": 0.0003, "num_tokens": 2718780.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 171.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.029687346890568733, "kl": 0.0023615637328475714, "learning_rate": 2.54e-07, "loss": 0.0001, "num_tokens": 2719062.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 171.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.018211375921964645, "kl": 0.012510097585618496, "learning_rate": 2.5366666666666665e-07, "loss": 0.0006, "num_tokens": 2719322.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 171.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.10099165141582489, "kl": 0.005628938903100789, "learning_rate": 2.533333333333333e-07, "loss": 0.0003, "num_tokens": 2719559.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 171.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06273996084928513, "kl": 0.061120785772800446, "learning_rate": 2.53e-07, "loss": 0.0031, "num_tokens": 2719930.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.018921660259366035, "kl": 0.011749022640287876, "learning_rate": 2.526666666666667e-07, "loss": 0.0007, "num_tokens": 2720204.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 171.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.019094351679086685, "kl": 0.0008994002782856114, "learning_rate": 2.5233333333333336e-07, "loss": 0.0, "num_tokens": 2720513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 171.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1848917156457901, "kl": 0.1712716445326805, "learning_rate": 2.5200000000000003e-07, "loss": 0.0086, "num_tokens": 2720828.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 171.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03211478516459465, "kl": 0.005016311421059072, "learning_rate": 2.5166666666666666e-07, "loss": 0.0003, "num_tokens": 2721118.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 171.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.3765474259853363, "kl": 0.0724665205925703, "learning_rate": 2.5133333333333333e-07, "loss": 0.0031, "num_tokens": 2721463.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 171.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04638824984431267, "kl": 0.0047746936907060444, "learning_rate": 2.51e-07, "loss": 0.0002, "num_tokens": 2721790.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 171.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.850092887878418, "kl": 0.05242218263447285, "learning_rate": 2.5066666666666663e-07, "loss": 0.0393, "num_tokens": 2722199.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 9249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 171.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.025840256363153458, "kl": 0.09576661139726639, "learning_rate": 2.5033333333333336e-07, "loss": 0.0048, "num_tokens": 2722571.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 171.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01032637245953083, "kl": 0.0002431079774396494, "learning_rate": 2.5e-07, "loss": 0.0, "num_tokens": 2722835.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 171.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06110481172800064, "kl": 0.009592322399839759, "learning_rate": 2.4966666666666667e-07, "loss": 0.0005, "num_tokens": 2723128.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 171.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01683252677321434, "kl": 0.000428263854701072, "learning_rate": 2.4933333333333334e-07, "loss": 0.0, "num_tokens": 2723398.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 171.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.05815865099430084, "kl": 0.0010071337219415, "learning_rate": 2.49e-07, "loss": 0.0001, "num_tokens": 2723611.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 171.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.008881403133273125, "kl": 0.0002932734787464142, "learning_rate": 2.4866666666666664e-07, "loss": 0.0, "num_tokens": 2723855.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 171.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.034985411912202835, "kl": 0.011005035368725657, "learning_rate": 2.4833333333333337e-07, "loss": 0.0006, "num_tokens": 2724178.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 171.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.012652016244828701, "kl": 0.0001588374379934976, "learning_rate": 2.48e-07, "loss": 0.0, "num_tokens": 2724434.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 171.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.06394850462675095, "kl": 0.0523146316409111, "learning_rate": 2.476666666666667e-07, "loss": 0.0026, "num_tokens": 2724772.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 171.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.047183990478516, "kl": 0.006762051023542881, "learning_rate": 2.4733333333333335e-07, "loss": 0.0933, "num_tokens": 2725051.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 171.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.6576875448226929, "kl": 0.06636911258101463, "learning_rate": 2.47e-07, "loss": 0.0035, "num_tokens": 2725360.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 171.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027701316867023706, "kl": 0.00012449621863197535, "learning_rate": 2.466666666666667e-07, "loss": 0.0, "num_tokens": 2725580.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 171.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03198763728141785, "kl": 0.005203839216846973, "learning_rate": 2.4633333333333333e-07, "loss": 0.0003, "num_tokens": 2725905.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 171.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04723777621984482, "kl": 0.0014811183791607618, "learning_rate": 2.46e-07, "loss": 0.0001, "num_tokens": 2726117.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 171.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 1.7707160711288452, "kl": 0.14118522591888905, "learning_rate": 2.456666666666667e-07, "loss": -0.0743, "num_tokens": 2726479.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 171.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09439795464277267, "kl": 0.030508296564221382, "learning_rate": 2.4533333333333336e-07, "loss": 0.0015, "num_tokens": 2726811.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 171.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.12211757898330688, "kl": 0.007215000689029694, "learning_rate": 2.45e-07, "loss": 0.0004, "num_tokens": 2727032.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 171.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 7.447613716125488, "kl": 0.06826008018106222, "learning_rate": 2.446666666666667e-07, "loss": -0.026, "num_tokens": 2727322.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 171.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.46034368872642517, "kl": 0.06429689936339855, "learning_rate": 2.4433333333333334e-07, "loss": 0.0038, "num_tokens": 2727612.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 171.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03711140900850296, "kl": 0.0030567603826057166, "learning_rate": 2.4399999999999996e-07, "loss": 0.0002, "num_tokens": 2727915.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 171.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.06139790639281273, "kl": 0.005101605493109673, "learning_rate": 2.436666666666667e-07, "loss": 0.0003, "num_tokens": 2728179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 171.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.004615632351487875, "kl": 0.0003375775704625994, "learning_rate": 2.433333333333333e-07, "loss": 0.0, "num_tokens": 2728496.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 171.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.799159526824951, "kl": 0.17506680742371827, "learning_rate": 2.43e-07, "loss": 0.2766, "num_tokens": 2728791.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.255412757396698, "kl": 0.021144443744560704, "learning_rate": 2.4266666666666667e-07, "loss": 0.0014, "num_tokens": 2729079.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 171.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.36697056889533997, "kl": 0.024108875542879105, "learning_rate": 2.4233333333333335e-07, "loss": 0.0016, "num_tokens": 2729346.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 171.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 5.15786075592041, "kl": 0.6303621083498001, "learning_rate": 2.42e-07, "loss": 0.068, "num_tokens": 2729711.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 9275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 171.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.029184000566601753, "kl": 0.005045531550422311, "learning_rate": 2.416666666666667e-07, "loss": 0.0003, "num_tokens": 2729993.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 171.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02143452689051628, "kl": 0.007529604714363813, "learning_rate": 2.413333333333333e-07, "loss": 0.0004, "num_tokens": 2730287.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 171.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.12209755182266235, "kl": 0.007590380730107427, "learning_rate": 2.41e-07, "loss": 0.0005, "num_tokens": 2730574.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 171.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.07218904048204422, "kl": 0.016672561643645167, "learning_rate": 2.406666666666667e-07, "loss": 0.001, "num_tokens": 2730856.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 171.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09310668706893921, "kl": 0.009345972328446805, "learning_rate": 2.403333333333333e-07, "loss": 0.0005, "num_tokens": 2731133.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 171.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.34855785965919495, "kl": 0.08765563741326332, "learning_rate": 2.4000000000000003e-07, "loss": 0.0047, "num_tokens": 2731447.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 171.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.025222888216376305, "kl": 0.006499540293589234, "learning_rate": 2.3966666666666666e-07, "loss": 0.0003, "num_tokens": 2731788.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 171.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017833812162280083, "kl": 0.0035702288150787354, "learning_rate": 2.3933333333333333e-07, "loss": 0.0002, "num_tokens": 2732024.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 171.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.017632756382226944, "kl": 0.0005254000425338745, "learning_rate": 2.39e-07, "loss": 0.0, "num_tokens": 2732236.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 171.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 16.372608184814453, "kl": 0.10826432891190052, "learning_rate": 2.386666666666667e-07, "loss": 0.0565, "num_tokens": 2732541.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 171.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 7.463217480108142e-05, "kl": 2.205371856689453e-06, "learning_rate": 2.3833333333333334e-07, "loss": 0.0, "num_tokens": 2732761.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 171.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.062440160661935806, "kl": 0.010460796765983105, "learning_rate": 2.3800000000000001e-07, "loss": 0.0006, "num_tokens": 2733091.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.011701135896146297, "kl": 0.0007465392409358174, "learning_rate": 2.3766666666666666e-07, "loss": 0.0, "num_tokens": 2733351.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 172.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03472888097167015, "kl": 0.005237925099208951, "learning_rate": 2.3733333333333331e-07, "loss": 0.0003, "num_tokens": 2733647.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 172.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.19062164425849915, "kl": 0.021663925144821405, "learning_rate": 2.3700000000000002e-07, "loss": 0.001, "num_tokens": 2733943.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 172.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02929820492863655, "kl": 0.022063929587602615, "learning_rate": 2.3666666666666667e-07, "loss": 0.0011, "num_tokens": 2734218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 172.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011094835645053536, "kl": 2.3543834686279297e-06, "learning_rate": 2.3633333333333335e-07, "loss": 0.0, "num_tokens": 2734438.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 172.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419840291142464, "kl": 0.01258205994963646, "learning_rate": 2.36e-07, "loss": 0.0006, "num_tokens": 2734773.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 172.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.017215872183442116, "kl": 0.0005730873963329941, "learning_rate": 2.3566666666666667e-07, "loss": 0.0, "num_tokens": 2735079.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 172.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.2060709148645401, "kl": 0.018995384220033884, "learning_rate": 2.3533333333333332e-07, "loss": 0.001, "num_tokens": 2735397.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 172.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03913000226020813, "kl": 0.01191510446369648, "learning_rate": 2.3500000000000003e-07, "loss": 0.0006, "num_tokens": 2735693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 172.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.09895136207342148, "kl": 0.03540264815092087, "learning_rate": 2.3466666666666668e-07, "loss": 0.0018, "num_tokens": 2736017.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 172.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10670503973960876, "kl": 0.007496505975723267, "learning_rate": 2.3433333333333335e-07, "loss": 0.0004, "num_tokens": 2736278.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 172.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01744152419269085, "kl": 0.012766205705702305, "learning_rate": 2.34e-07, "loss": 0.0006, "num_tokens": 2736538.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 172.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.1122836172580719, "kl": 0.013083487749099731, "learning_rate": 2.3366666666666665e-07, "loss": 0.0007, "num_tokens": 2736846.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 172.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.3355092704296112, "kl": 0.03203204367309809, "learning_rate": 2.3333333333333333e-07, "loss": 0.0018, "num_tokens": 2737164.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05035262182354927, "kl": 0.0025760321877896786, "learning_rate": 2.3299999999999998e-07, "loss": 0.0001, "num_tokens": 2737420.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 172.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.08612331002950668, "kl": 0.0055491626262664795, "learning_rate": 2.3266666666666669e-07, "loss": 0.0004, "num_tokens": 2737655.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 172.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.012322024442255497, "kl": 0.15765418857336044, "learning_rate": 2.3233333333333334e-07, "loss": 0.0079, "num_tokens": 2737966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07245767116546631, "kl": 0.00193899535224773, "learning_rate": 2.32e-07, "loss": 0.0001, "num_tokens": 2738238.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 172.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.024333052337169647, "kl": 0.008449589367955923, "learning_rate": 2.3166666666666666e-07, "loss": 0.0004, "num_tokens": 2738512.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 172.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030806667637079954, "kl": 4.373490810394287e-05, "learning_rate": 2.3133333333333337e-07, "loss": 0.0, "num_tokens": 2738724.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 172.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.12267035990953445, "kl": 0.019190243910998106, "learning_rate": 2.31e-07, "loss": 0.001, "num_tokens": 2739057.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.021785197779536247, "kl": 0.001960778550710529, "learning_rate": 2.306666666666667e-07, "loss": 0.0001, "num_tokens": 2739329.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 172.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.015194157138466835, "kl": 0.00043241679668426514, "learning_rate": 2.3033333333333334e-07, "loss": 0.0, "num_tokens": 2739541.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 172.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.007095846347510815, "kl": 0.00037607570993714035, "learning_rate": 2.3e-07, "loss": 0.0, "num_tokens": 2739861.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 172.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.021073205396533012, "kl": 0.0006072536125429906, "learning_rate": 2.2966666666666667e-07, "loss": 0.0, "num_tokens": 2740080.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03315271809697151, "kl": 0.007109067548299208, "learning_rate": 2.2933333333333332e-07, "loss": 0.0004, "num_tokens": 2740350.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 172.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08940380811691284, "kl": 0.05822291411459446, "learning_rate": 2.2900000000000003e-07, "loss": 0.0029, "num_tokens": 2740753.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.008731303736567497, "kl": 0.0004929818242089823, "learning_rate": 2.2866666666666665e-07, "loss": 0.0, "num_tokens": 2741015.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 172.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06337254494428635, "kl": 0.0038957372307777405, "learning_rate": 2.2833333333333335e-07, "loss": 0.0002, "num_tokens": 2741283.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 172.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.018643449991941452, "kl": 0.0019971057190559804, "learning_rate": 2.28e-07, "loss": 0.0001, "num_tokens": 2741553.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 172.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.012605863623321056, "kl": 0.000526178628206253, "learning_rate": 2.2766666666666668e-07, "loss": 0.0, "num_tokens": 2741813.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 172.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.5165688991546631, "kl": 0.041914600413292646, "learning_rate": 2.2733333333333333e-07, "loss": 0.0023, "num_tokens": 2742153.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 172.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.13938301801681519, "kl": 0.011621064433711581, "learning_rate": 2.2699999999999998e-07, "loss": 0.0006, "num_tokens": 2742437.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 172.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.613528251647949, "kl": 0.06211712956428528, "learning_rate": 2.2666666666666668e-07, "loss": -0.0025, "num_tokens": 2742814.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 172.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.276278495788574, "kl": 0.05184578709304333, "learning_rate": 2.2633333333333334e-07, "loss": -0.0061, "num_tokens": 2743141.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 172.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03256721794605255, "kl": 0.05650538019835949, "learning_rate": 2.26e-07, "loss": 0.0028, "num_tokens": 2743507.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 172.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.000321406900184229, "kl": 0.001245351741090417, "learning_rate": 2.2566666666666666e-07, "loss": 0.0001, "num_tokens": 2743787.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 172.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.004895047750324011, "kl": 0.00033820047974586487, "learning_rate": 2.2533333333333334e-07, "loss": 0.0, "num_tokens": 2744031.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 172.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.013029099442064762, "kl": 0.002477428744896315, "learning_rate": 2.25e-07, "loss": 0.0001, "num_tokens": 2744319.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 172.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.21533021330833435, "kl": 0.03718895465135574, "learning_rate": 2.246666666666667e-07, "loss": 0.0019, "num_tokens": 2744634.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 172.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.07662699371576309, "kl": 0.014657713938504457, "learning_rate": 2.2433333333333334e-07, "loss": 0.0008, "num_tokens": 2744908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 172.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.632269859313965, "kl": 0.1661722231656313, "learning_rate": 2.2400000000000002e-07, "loss": 0.4594, "num_tokens": 2745440.0, "reward": 5.550000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 5.550000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 9329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.75, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 172.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05726558715105057, "kl": 0.010915250750258565, "learning_rate": 2.2366666666666667e-07, "loss": 0.0006, "num_tokens": 2745859.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 172.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 0.6450158357620239, "kl": 0.5728609263896942, "learning_rate": 2.2333333333333332e-07, "loss": 0.0422, "num_tokens": 2746164.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 172.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.2203519493341446, "kl": 0.02634395915083587, "learning_rate": 2.23e-07, "loss": 0.0012, "num_tokens": 2746397.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 172.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03175600990653038, "kl": 0.010970419738441706, "learning_rate": 2.2266666666666665e-07, "loss": 0.0006, "num_tokens": 2746689.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 172.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.5622710585594177, "kl": 0.17441180627793074, "learning_rate": 2.2233333333333335e-07, "loss": 0.0075, "num_tokens": 2747057.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 172.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 0.5868734121322632, "kl": 0.3006749153137207, "learning_rate": 2.22e-07, "loss": -0.004, "num_tokens": 2747427.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 9335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 172.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017449725419282913, "kl": 0.0035766512155532837, "learning_rate": 2.2166666666666668e-07, "loss": 0.0002, "num_tokens": 2747663.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 172.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.01739422045648098, "kl": 0.0001802891492843628, "learning_rate": 2.2133333333333333e-07, "loss": 0.0, "num_tokens": 2747871.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 172.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.944025754928589, "kl": 0.06725234352052212, "learning_rate": 2.2100000000000003e-07, "loss": 0.1822, "num_tokens": 2748213.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 9338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.008182493038475513, "kl": 0.0011069655301980674, "learning_rate": 2.2066666666666666e-07, "loss": 0.0001, "num_tokens": 2748473.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 172.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 2.060744285583496, "kl": 0.2748560756444931, "learning_rate": 2.2033333333333336e-07, "loss": 0.015, "num_tokens": 2748821.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 172.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06978320330381393, "kl": 0.011245114263147116, "learning_rate": 2.2e-07, "loss": 0.0006, "num_tokens": 2749103.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.015462875366211, "kl": 0.05356736574321985, "learning_rate": 2.1966666666666666e-07, "loss": -0.0453, "num_tokens": 2749388.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 173.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02747127041220665, "kl": 0.005177264800295234, "learning_rate": 2.1933333333333334e-07, "loss": 0.0002, "num_tokens": 2749704.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02128387801349163, "kl": 0.02021583146415651, "learning_rate": 2.19e-07, "loss": 0.001, "num_tokens": 2749979.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 173.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.002335039433091879, "kl": 0.0014481768012046814, "learning_rate": 2.186666666666667e-07, "loss": 0.0001, "num_tokens": 2750291.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 173.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.019124042242765427, "kl": 0.0006656706391368061, "learning_rate": 2.1833333333333332e-07, "loss": 0.0, "num_tokens": 2750597.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 173.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.005986900068819523, "kl": 0.0034085522347595543, "learning_rate": 2.1800000000000002e-07, "loss": 0.0002, "num_tokens": 2750855.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 173.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.015862395986914635, "kl": 0.003157562459819019, "learning_rate": 2.1766666666666667e-07, "loss": 0.0002, "num_tokens": 2751143.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 173.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.016466058790683746, "kl": 0.04704936593770981, "learning_rate": 2.1733333333333335e-07, "loss": 0.0024, "num_tokens": 2751547.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 173.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.012452344410121441, "kl": 0.0005145557224750519, "learning_rate": 2.17e-07, "loss": 0.0, "num_tokens": 2751807.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 173.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.020508766174316406, "kl": 0.01153493532910943, "learning_rate": 2.1666666666666665e-07, "loss": 0.0004, "num_tokens": 2752161.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 173.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.013954057358205318, "kl": 0.0005302221106830984, "learning_rate": 2.1633333333333335e-07, "loss": 0.0, "num_tokens": 2752480.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.058448314666748, "kl": 0.13925595860928297, "learning_rate": 2.16e-07, "loss": -0.0518, "num_tokens": 2752744.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 173.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03587964549660683, "kl": 0.006062433822080493, "learning_rate": 2.1566666666666668e-07, "loss": 0.0003, "num_tokens": 2753026.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 173.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.6479862928390503, "kl": 0.17314231861382723, "learning_rate": 2.1533333333333333e-07, "loss": 0.0084, "num_tokens": 2753316.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 173.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.027127057313919067, "kl": 0.05456158146262169, "learning_rate": 2.15e-07, "loss": 0.0027, "num_tokens": 2753648.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 173.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.018730001524090767, "kl": 0.000610843300819397, "learning_rate": 2.1466666666666666e-07, "loss": 0.0, "num_tokens": 2753860.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 173.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 3.952392580686137e-05, "kl": 2.034008502960205e-06, "learning_rate": 2.1433333333333336e-07, "loss": 0.0, "num_tokens": 2754080.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.007977486588060856, "kl": 0.0005839644290972501, "learning_rate": 2.14e-07, "loss": 0.0, "num_tokens": 2754362.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 173.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06698492169380188, "kl": 0.017664545215666294, "learning_rate": 2.136666666666667e-07, "loss": 0.0009, "num_tokens": 2754687.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 173.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.019937697798013687, "kl": 0.00027641008637147024, "learning_rate": 2.1333333333333334e-07, "loss": 0.0, "num_tokens": 2754953.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 173.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.045142341405153275, "kl": 0.002216329798102379, "learning_rate": 2.13e-07, "loss": 0.0001, "num_tokens": 2755186.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 173.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.015254939906299114, "kl": 0.2658967822790146, "learning_rate": 2.1266666666666667e-07, "loss": 0.0133, "num_tokens": 2755490.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 173.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 5.326937675476074, "kl": 0.052946810610592365, "learning_rate": 2.1233333333333332e-07, "loss": -0.028, "num_tokens": 2755802.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 9364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 173.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.015295280143618584, "kl": 0.0021119669545441866, "learning_rate": 2.1200000000000002e-07, "loss": 0.0001, "num_tokens": 2756013.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 173.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015741040697321296, "kl": 8.230805542552844e-05, "learning_rate": 2.1166666666666667e-07, "loss": 0.0, "num_tokens": 2756233.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 173.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06418458372354507, "kl": 0.009016437456011772, "learning_rate": 2.1133333333333335e-07, "loss": 0.0005, "num_tokens": 2756577.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 173.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03234009072184563, "kl": 0.002503427502233535, "learning_rate": 2.11e-07, "loss": 0.0001, "num_tokens": 2756879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 173.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013644895516335964, "kl": 3.719329833984375e-05, "learning_rate": 2.106666666666667e-07, "loss": 0.0, "num_tokens": 2757091.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.042857646942139, "kl": 0.11064870469272137, "learning_rate": 2.1033333333333332e-07, "loss": 0.1282, "num_tokens": 2757370.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 173.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.162346363067627, "kl": 0.03916444256901741, "learning_rate": 2.1000000000000003e-07, "loss": 0.0437, "num_tokens": 2757710.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 173.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.087526835501194, "kl": 0.004464875208213925, "learning_rate": 2.0966666666666668e-07, "loss": 0.0003, "num_tokens": 2757937.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 173.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06550052762031555, "kl": 0.04014564026147127, "learning_rate": 2.0933333333333333e-07, "loss": 0.0021, "num_tokens": 2758228.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 173.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.07102903723716736, "kl": 0.02995441108942032, "learning_rate": 2.09e-07, "loss": 0.0015, "num_tokens": 2758553.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 173.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 10.05695915222168, "kl": 0.04724416509270668, "learning_rate": 2.0866666666666666e-07, "loss": 0.0338, "num_tokens": 2758798.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 9375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 173.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.008338548243045807, "kl": 0.0011786073446273804, "learning_rate": 2.0833333333333336e-07, "loss": 0.0001, "num_tokens": 2759058.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 0.42862752079963684, "kl": 0.5528724770992994, "learning_rate": 2.0799999999999998e-07, "loss": -0.0377, "num_tokens": 2759344.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009803921915590763, "clip_ratio/low_min": 0.009803921915590763, "clip_ratio/region_mean": 0.009803921915590763, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 173.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.839827537536621, "kl": 0.10357399843633175, "learning_rate": 2.0766666666666669e-07, "loss": 0.1386, "num_tokens": 2759671.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 173.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03441961854696274, "kl": 0.1616528481245041, "learning_rate": 2.0733333333333334e-07, "loss": 0.0081, "num_tokens": 2759981.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 173.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018648155964910984, "kl": 0.0035570859909057617, "learning_rate": 2.0700000000000001e-07, "loss": 0.0002, "num_tokens": 2760217.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 173.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04937727749347687, "kl": 0.00658849161118269, "learning_rate": 2.0666666666666666e-07, "loss": 0.0003, "num_tokens": 2760508.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 173.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.017838289961218834, "kl": 0.012574596330523491, "learning_rate": 2.0633333333333331e-07, "loss": 0.0006, "num_tokens": 2760768.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 173.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.8720903396606445, "kl": 0.31305863335728645, "learning_rate": 2.0600000000000002e-07, "loss": -0.0119, "num_tokens": 2761107.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 9383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.031148681417107582, "kl": 0.0011567730689421296, "learning_rate": 2.0566666666666667e-07, "loss": 0.0001, "num_tokens": 2761403.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 173.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.6330201625823975, "kl": 0.318508045602357, "learning_rate": 2.0533333333333335e-07, "loss": -0.0655, "num_tokens": 2761675.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 173.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09389530122280121, "kl": 0.0770426094532013, "learning_rate": 2.05e-07, "loss": 0.0039, "num_tokens": 2762050.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 173.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 3.0350518226623535, "kl": 0.09583992511034012, "learning_rate": 2.0466666666666667e-07, "loss": -0.0099, "num_tokens": 2762411.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 9387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.11118227988481522, "kl": 0.01862932974472642, "learning_rate": 2.0433333333333332e-07, "loss": 0.0009, "num_tokens": 2762686.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 173.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.00037408937350846827, "kl": 0.0012370496988296509, "learning_rate": 2.0400000000000003e-07, "loss": 0.0001, "num_tokens": 2762966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 173.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014750309055671096, "kl": 6.294846571108792e-05, "learning_rate": 2.0366666666666668e-07, "loss": 0.0, "num_tokens": 2763222.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 173.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.09263962507247925, "kl": 0.01663209032267332, "learning_rate": 2.0333333333333335e-07, "loss": 0.0009, "num_tokens": 2763564.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 173.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.029124464839696884, "kl": 0.008848333265632391, "learning_rate": 2.03e-07, "loss": 0.0004, "num_tokens": 2763838.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9392 }, { "clip_ratio/high_max": 0.006666666828095913, "clip_ratio/high_mean": 0.006666666828095913, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006666666828095913, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 173.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.122303009033203, "kl": 0.14825211837887764, "learning_rate": 2.0266666666666666e-07, "loss": -0.0785, "num_tokens": 2764206.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 9393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 173.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03227252885699272, "kl": 0.039860278367996216, "learning_rate": 2.0233333333333333e-07, "loss": 0.002, "num_tokens": 2764578.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 173.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 6.471002101898193, "kl": 0.032280536368489265, "learning_rate": 2.0199999999999998e-07, "loss": 0.3484, "num_tokens": 2764878.0, "reward": 5.875, "reward_std": 2.136000871658325, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 2.136000871658325, "step": 9395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 174.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.07729319483041763, "kl": 0.002565067959949374, "learning_rate": 2.0166666666666669e-07, "loss": 0.0001, "num_tokens": 2765147.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 174.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03395868465304375, "kl": 0.00591424060985446, "learning_rate": 2.0133333333333334e-07, "loss": 0.0003, "num_tokens": 2765478.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 174.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.013296353630721569, "kl": 0.000528055927134119, "learning_rate": 2.01e-07, "loss": 0.0, "num_tokens": 2765798.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 174.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.008081411942839622, "kl": 0.001099076820537448, "learning_rate": 2.0066666666666666e-07, "loss": 0.0001, "num_tokens": 2766058.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06403493881225586, "kl": 0.0036330987350083888, "learning_rate": 2.0033333333333337e-07, "loss": 0.0002, "num_tokens": 2766356.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 174.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.05856318026781082, "kl": 0.013797045103274286, "learning_rate": 2e-07, "loss": 0.0007, "num_tokens": 2766642.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 174.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03477952256798744, "kl": 0.008628587384009734, "learning_rate": 1.996666666666667e-07, "loss": 0.0005, "num_tokens": 2766929.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 174.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02780861221253872, "kl": 0.0044317287392914295, "learning_rate": 1.9933333333333334e-07, "loss": 0.0002, "num_tokens": 2767211.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 174.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03354954719543457, "kl": 0.003975623272708617, "learning_rate": 1.99e-07, "loss": 0.0002, "num_tokens": 2767471.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 174.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.041201986372470856, "kl": 0.05074997805058956, "learning_rate": 1.9866666666666667e-07, "loss": 0.0026, "num_tokens": 2767839.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 174.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028058665338903666, "kl": 0.00010259449481964111, "learning_rate": 1.9833333333333332e-07, "loss": 0.0, "num_tokens": 2768051.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.027216970920562744, "kl": 0.005215051583945751, "learning_rate": 1.9800000000000003e-07, "loss": 0.0003, "num_tokens": 2768319.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 174.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.15924301743507385, "kl": 0.026444242801517248, "learning_rate": 1.9766666666666665e-07, "loss": 0.0012, "num_tokens": 2768626.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 174.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04549303650856018, "kl": 0.030992218293249607, "learning_rate": 1.9733333333333335e-07, "loss": 0.0015, "num_tokens": 2768969.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 174.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.20801319181919098, "kl": 0.10140269249677658, "learning_rate": 1.97e-07, "loss": 0.0048, "num_tokens": 2769333.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 174.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.017625324428081512, "kl": 0.000512242317199707, "learning_rate": 1.9666666666666668e-07, "loss": 0.0, "num_tokens": 2769545.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06326346099376678, "kl": 0.004295312101021409, "learning_rate": 1.9633333333333333e-07, "loss": 0.0002, "num_tokens": 2769814.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 174.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01450822688639164, "kl": 0.26604562997817993, "learning_rate": 1.9599999999999998e-07, "loss": 0.0133, "num_tokens": 2770118.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 174.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02843433804810047, "kl": 0.002526771044358611, "learning_rate": 1.9566666666666668e-07, "loss": 0.0001, "num_tokens": 2770422.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 174.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02168596349656582, "kl": 0.0026514212950132787, "learning_rate": 1.953333333333333e-07, "loss": 0.0001, "num_tokens": 2770710.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 174.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.001842870144173503, "kl": 0.0035613924264907837, "learning_rate": 1.95e-07, "loss": 0.0002, "num_tokens": 2770946.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 174.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.1765530109405518, "kl": 0.03562296088784933, "learning_rate": 1.9466666666666666e-07, "loss": 0.1266, "num_tokens": 2771261.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 174.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.11406844854354858, "kl": 0.03590277023613453, "learning_rate": 1.9433333333333334e-07, "loss": 0.0017, "num_tokens": 2771591.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 174.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.003839787095785141, "kl": 0.0001465529203414917, "learning_rate": 1.94e-07, "loss": 0.0, "num_tokens": 2771799.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 174.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02580348588526249, "kl": 0.0061992957489565015, "learning_rate": 1.936666666666667e-07, "loss": 0.0003, "num_tokens": 2772136.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 174.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.002588248113170266, "kl": 0.00010200142423855141, "learning_rate": 1.9333333333333334e-07, "loss": 0.0, "num_tokens": 2772356.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 174.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.020115790888667107, "kl": 0.0062670658044226, "learning_rate": 1.9300000000000002e-07, "loss": 0.0003, "num_tokens": 2772628.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 174.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.270094156265259, "kl": 0.16399965435266495, "learning_rate": 1.9266666666666667e-07, "loss": 0.0078, "num_tokens": 2772996.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 9423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 174.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.376574754714966, "kl": 0.07148301228880882, "learning_rate": 1.9233333333333332e-07, "loss": 0.0804, "num_tokens": 2773340.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 174.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009622232057154179, "kl": 0.0006956764264032245, "learning_rate": 1.92e-07, "loss": 0.0, "num_tokens": 2773654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 174.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.022656532004475594, "kl": 0.001196057244669646, "learning_rate": 1.9166666666666665e-07, "loss": 0.0001, "num_tokens": 2773932.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 174.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04341169074177742, "kl": 0.01216940488666296, "learning_rate": 1.9133333333333335e-07, "loss": 0.0006, "num_tokens": 2774193.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 174.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.06408816576004028, "kl": 0.028108091093599796, "learning_rate": 1.91e-07, "loss": 0.0014, "num_tokens": 2774465.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 174.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04262426495552063, "kl": 0.004996336298063397, "learning_rate": 1.9066666666666668e-07, "loss": 0.0002, "num_tokens": 2774733.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 174.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.037320878356695175, "kl": 0.0019663242273963988, "learning_rate": 1.9033333333333333e-07, "loss": 0.0001, "num_tokens": 2774976.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.000432979897595942, "kl": 0.0013146064011380076, "learning_rate": 1.9000000000000003e-07, "loss": 0.0001, "num_tokens": 2775253.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 174.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 7.307594933081418e-05, "kl": 2.250075340270996e-06, "learning_rate": 1.8966666666666666e-07, "loss": 0.0, "num_tokens": 2775473.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 174.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02457534521818161, "kl": 0.05075225606560707, "learning_rate": 1.8933333333333336e-07, "loss": 0.0025, "num_tokens": 2775809.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 174.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04618788883090019, "kl": 0.007547435350716114, "learning_rate": 1.89e-07, "loss": 0.0004, "num_tokens": 2776101.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 174.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04452917352318764, "kl": 0.011068075662478805, "learning_rate": 1.8866666666666666e-07, "loss": 0.0006, "num_tokens": 2776438.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 174.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.7525200843811035, "kl": 0.1560510378330946, "learning_rate": 1.8833333333333334e-07, "loss": 0.0045, "num_tokens": 2776794.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 174.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.6089701652526855, "kl": 0.06847019167616963, "learning_rate": 1.88e-07, "loss": 0.0041, "num_tokens": 2777089.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 9437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 174.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.260293960571289, "kl": 0.13297653547488153, "learning_rate": 1.876666666666667e-07, "loss": 0.0963, "num_tokens": 2777393.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 174.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0437699556350708, "kl": 0.001357494038529694, "learning_rate": 1.8733333333333332e-07, "loss": 0.0001, "num_tokens": 2777627.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 174.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.380171060562134, "kl": 0.06992382183670998, "learning_rate": 1.87e-07, "loss": 0.0366, "num_tokens": 2777934.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 174.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03872547671198845, "kl": 0.04057466797530651, "learning_rate": 1.8666666666666667e-07, "loss": 0.002, "num_tokens": 2778338.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 174.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.76997309923172, "kl": 0.043212915770709515, "learning_rate": 1.8633333333333335e-07, "loss": 0.0029, "num_tokens": 2778620.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 174.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.02050241455435753, "kl": 0.011363848112523556, "learning_rate": 1.86e-07, "loss": 0.0006, "num_tokens": 2778934.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 174.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011737901950255036, "kl": 0.000479087233543396, "learning_rate": 1.8566666666666667e-07, "loss": 0.0, "num_tokens": 2779194.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 174.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.20299763977527618, "kl": 0.02106556110084057, "learning_rate": 1.8533333333333335e-07, "loss": 0.0013, "num_tokens": 2779478.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 174.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.10031075030565262, "kl": 0.0112114567309618, "learning_rate": 1.85e-07, "loss": 0.0006, "num_tokens": 2779751.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 174.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0064816526137292385, "kl": 0.1617884263396263, "learning_rate": 1.8466666666666668e-07, "loss": 0.0081, "num_tokens": 2780060.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 174.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03821108117699623, "kl": 0.0007517486810684204, "learning_rate": 1.8433333333333336e-07, "loss": 0.0, "num_tokens": 2780316.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 174.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02685854770243168, "kl": 0.0031300827395170927, "learning_rate": 1.84e-07, "loss": 0.0002, "num_tokens": 2780548.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 175.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0779147669672966, "kl": 0.010218548122793436, "learning_rate": 1.8366666666666666e-07, "loss": 0.0005, "num_tokens": 2780879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 175.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.016581114381551743, "kl": 0.00044539570808410645, "learning_rate": 1.8333333333333333e-07, "loss": 0.0, "num_tokens": 2781091.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 175.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.044473301619291306, "kl": 0.011632442474365234, "learning_rate": 1.83e-07, "loss": 0.0006, "num_tokens": 2781416.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.1050993874669075, "kl": 0.023846641182899475, "learning_rate": 1.8266666666666666e-07, "loss": 0.0012, "num_tokens": 2781690.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 175.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07377281785011292, "kl": 0.006855746265500784, "learning_rate": 1.8233333333333334e-07, "loss": 0.0003, "num_tokens": 2781990.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 175.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.08001474291086197, "kl": 0.008238217793405056, "learning_rate": 1.8200000000000002e-07, "loss": 0.0004, "num_tokens": 2782262.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 175.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0440627820789814, "kl": 0.03242575004696846, "learning_rate": 1.8166666666666667e-07, "loss": 0.0016, "num_tokens": 2782562.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 175.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 5.663631439208984, "kl": 0.04502807557582855, "learning_rate": 1.8133333333333334e-07, "loss": 0.0193, "num_tokens": 2782823.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 175.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.7345893383026123, "kl": 0.012640192173421383, "learning_rate": 1.8100000000000002e-07, "loss": 0.0245, "num_tokens": 2783181.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 175.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011049036402255297, "kl": 2.436339855194092e-06, "learning_rate": 1.8066666666666667e-07, "loss": 0.0, "num_tokens": 2783401.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 175.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.003071162384003401, "kl": 0.0004178136441623792, "learning_rate": 1.8033333333333332e-07, "loss": 0.0, "num_tokens": 2783620.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 175.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.6407954692840576, "kl": 0.0012169579276815057, "learning_rate": 1.8e-07, "loss": 0.0003, "num_tokens": 2783892.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 175.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05120643600821495, "kl": 0.008119082893244922, "learning_rate": 1.7966666666666667e-07, "loss": 0.0005, "num_tokens": 2784192.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 175.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.12649521231651306, "kl": 0.09553695470094681, "learning_rate": 1.7933333333333332e-07, "loss": 0.0048, "num_tokens": 2784561.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 175.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.027567535638809204, "kl": 0.0007620364413014613, "learning_rate": 1.79e-07, "loss": 0.0, "num_tokens": 2784817.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 175.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03359556570649147, "kl": 0.003997477513621561, "learning_rate": 1.7866666666666668e-07, "loss": 0.0002, "num_tokens": 2785077.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 175.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.083592891693115, "kl": 0.10924112051725388, "learning_rate": 1.7833333333333333e-07, "loss": 0.0732, "num_tokens": 2785428.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 175.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05745260789990425, "kl": 0.03714505583047867, "learning_rate": 1.78e-07, "loss": 0.0019, "num_tokens": 2785805.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 175.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.015942562371492386, "kl": 0.006867463467642665, "learning_rate": 1.7766666666666668e-07, "loss": 0.0003, "num_tokens": 2786101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 175.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09808575361967087, "kl": 0.012375690042972565, "learning_rate": 1.7733333333333336e-07, "loss": 0.0006, "num_tokens": 2786394.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 175.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.027535833418369293, "kl": 0.004792378516867757, "learning_rate": 1.7699999999999998e-07, "loss": 0.0003, "num_tokens": 2786733.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 175.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.024536410346627235, "kl": 0.002142351266229525, "learning_rate": 1.7666666666666666e-07, "loss": 0.0001, "num_tokens": 2787060.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 175.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.055180296301841736, "kl": 0.004505733493715525, "learning_rate": 1.7633333333333334e-07, "loss": 0.0002, "num_tokens": 2787330.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 175.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.054977238178253174, "kl": 0.013792762532830238, "learning_rate": 1.7600000000000001e-07, "loss": 0.0007, "num_tokens": 2787659.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03392626717686653, "kl": 0.01006307639181614, "learning_rate": 1.7566666666666666e-07, "loss": 0.0005, "num_tokens": 2787941.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 175.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.007234607823193073, "kl": 0.0014080610708333552, "learning_rate": 1.7533333333333334e-07, "loss": 0.0001, "num_tokens": 2788218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 175.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.010728621855378151, "kl": 0.0011618470889516175, "learning_rate": 1.7500000000000002e-07, "loss": 0.0001, "num_tokens": 2788494.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 175.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.020322097465395927, "kl": 0.002917998470366001, "learning_rate": 1.7466666666666667e-07, "loss": 0.0001, "num_tokens": 2788806.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 175.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.028154859319329262, "kl": 0.03807249292731285, "learning_rate": 1.7433333333333335e-07, "loss": 0.0019, "num_tokens": 2789211.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.010237826965749264, "kl": 0.007193590514361858, "learning_rate": 1.7400000000000002e-07, "loss": 0.0004, "num_tokens": 2789483.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 175.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.06910774111747742, "kl": 0.006038739811629057, "learning_rate": 1.7366666666666667e-07, "loss": 0.0003, "num_tokens": 2789754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 175.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.6817761063575745, "kl": 0.11062488332390785, "learning_rate": 1.7333333333333332e-07, "loss": 0.0058, "num_tokens": 2790135.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 175.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.004457696806639433, "kl": 0.00023955106735229492, "learning_rate": 1.73e-07, "loss": 0.0, "num_tokens": 2790379.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0049491701647639275, "kl": 0.0005636787973344326, "learning_rate": 1.7266666666666668e-07, "loss": 0.0, "num_tokens": 2790663.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 175.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.07905175536870956, "kl": 0.00984371779486537, "learning_rate": 1.7233333333333333e-07, "loss": 0.0005, "num_tokens": 2790925.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 175.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016738688573241234, "kl": 0.000374002120224759, "learning_rate": 1.72e-07, "loss": 0.0, "num_tokens": 2791161.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9485 }, { "clip_ratio/high_max": 0.007936508394777775, "clip_ratio/high_mean": 0.007936508394777775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007936508394777775, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 175.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.760868549346924, "kl": 0.13168331235647202, "learning_rate": 1.7166666666666668e-07, "loss": -0.0298, "num_tokens": 2791511.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 175.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.003948070574551821, "kl": 0.000350169837474823, "learning_rate": 1.7133333333333333e-07, "loss": 0.0, "num_tokens": 2791771.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 175.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0054215374402701855, "kl": 0.0015189126133918762, "learning_rate": 1.71e-07, "loss": 0.0001, "num_tokens": 2791987.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 175.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.437551259994507, "kl": 0.10894716624170542, "learning_rate": 1.7066666666666669e-07, "loss": 0.0466, "num_tokens": 2792323.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.05501972511410713, "kl": 0.013732909690588713, "learning_rate": 1.7033333333333334e-07, "loss": 0.0007, "num_tokens": 2792610.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 175.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.07589367777109146, "kl": 0.02821679785847664, "learning_rate": 1.7e-07, "loss": 0.0015, "num_tokens": 2792912.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 175.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.030610423535108566, "kl": 0.0006005242466926575, "learning_rate": 1.6966666666666666e-07, "loss": 0.0, "num_tokens": 2793122.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 175.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.014374910853803158, "kl": 0.001605411758646369, "learning_rate": 1.6933333333333334e-07, "loss": 0.0001, "num_tokens": 2793452.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 175.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.328324317932129, "kl": 0.13618933409452438, "learning_rate": 1.69e-07, "loss": -0.3253, "num_tokens": 2793746.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 175.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.030537644401192665, "kl": 0.0034228264703415334, "learning_rate": 1.6866666666666667e-07, "loss": 0.0002, "num_tokens": 2794004.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 175.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05691957101225853, "kl": 0.007668401347473264, "learning_rate": 1.6833333333333335e-07, "loss": 0.0004, "num_tokens": 2794326.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 175.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.026415415108203888, "kl": 0.0015674991300329566, "learning_rate": 1.68e-07, "loss": 0.0001, "num_tokens": 2794622.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 175.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016319707501679659, "kl": 0.00015536861610598862, "learning_rate": 1.6766666666666667e-07, "loss": 0.0, "num_tokens": 2794936.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 175.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.046219997107982635, "kl": 0.006617294391617179, "learning_rate": 1.6733333333333335e-07, "loss": 0.0003, "num_tokens": 2795229.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 175.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012055712286382914, "kl": 2.7142465114593506e-05, "learning_rate": 1.6700000000000003e-07, "loss": 0.0, "num_tokens": 2795441.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 175.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.015128085389733315, "kl": 0.2658979743719101, "learning_rate": 1.6666666666666665e-07, "loss": 0.0133, "num_tokens": 2795745.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 175.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0355367511510849, "kl": 0.15592289716005325, "learning_rate": 1.6633333333333333e-07, "loss": 0.0078, "num_tokens": 2796058.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 175.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03484673053026199, "kl": 0.021474342793226242, "learning_rate": 1.66e-07, "loss": 0.0012, "num_tokens": 2796335.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019219344248995185, "kl": 0.0035479143261909485, "learning_rate": 1.6566666666666665e-07, "loss": 0.0002, "num_tokens": 2796571.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 176.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05576057359576225, "kl": 0.028212859178893268, "learning_rate": 1.6533333333333333e-07, "loss": 0.0014, "num_tokens": 2796843.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 176.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.032508548349142075, "kl": 0.004489997983910143, "learning_rate": 1.65e-07, "loss": 0.0002, "num_tokens": 2797111.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 176.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.10765808075666428, "kl": 0.03259006887674332, "learning_rate": 1.6466666666666669e-07, "loss": 0.0016, "num_tokens": 2797493.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 176.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.296911716461182, "kl": 0.06652853265404701, "learning_rate": 1.6433333333333334e-07, "loss": -0.0113, "num_tokens": 2797778.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 9508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 176.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.014875221066176891, "kl": 0.0035690803051693365, "learning_rate": 1.64e-07, "loss": 0.0001, "num_tokens": 2798038.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018377848900854588, "kl": 0.0035630017518997192, "learning_rate": 1.636666666666667e-07, "loss": 0.0002, "num_tokens": 2798274.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 176.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.020880894735455513, "kl": 0.0007867937456467189, "learning_rate": 1.6333333333333334e-07, "loss": 0.0, "num_tokens": 2798588.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 176.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04607066884636879, "kl": 0.003699503722600639, "learning_rate": 1.63e-07, "loss": 0.0002, "num_tokens": 2798890.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 176.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.017043661326169968, "kl": 0.012803932186216116, "learning_rate": 1.6266666666666667e-07, "loss": 0.0006, "num_tokens": 2799150.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 176.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.6555036306381226, "kl": 0.02057200577110052, "learning_rate": 1.6233333333333334e-07, "loss": 0.0007, "num_tokens": 2799502.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 176.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.027277125045657158, "kl": 0.0009510765812592581, "learning_rate": 1.62e-07, "loss": 0.0, "num_tokens": 2799737.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 176.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.1946507841348648, "kl": 0.023479865863919258, "learning_rate": 1.6166666666666667e-07, "loss": 0.0012, "num_tokens": 2800013.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 176.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.08500733226537704, "kl": 0.026968365535140038, "learning_rate": 1.6133333333333335e-07, "loss": 0.0014, "num_tokens": 2800301.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 176.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 4.235119342803955, "kl": 0.09658414218574762, "learning_rate": 1.61e-07, "loss": 0.0434, "num_tokens": 2800608.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 176.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07327579706907272, "kl": 0.011227508570300415, "learning_rate": 1.6066666666666668e-07, "loss": 0.0004, "num_tokens": 2800927.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 176.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04222364351153374, "kl": 0.03229001723229885, "learning_rate": 1.6033333333333335e-07, "loss": 0.0017, "num_tokens": 2801288.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 176.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03203504532575607, "kl": 0.002618239726871252, "learning_rate": 1.6e-07, "loss": 0.0001, "num_tokens": 2801560.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 176.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 4.730062007904053, "kl": 0.06036641966784373, "learning_rate": 1.5966666666666665e-07, "loss": 0.1174, "num_tokens": 2801858.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 176.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08959072828292847, "kl": 0.03773561678826809, "learning_rate": 1.5933333333333333e-07, "loss": 0.0019, "num_tokens": 2802276.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 176.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0642068162560463, "kl": 0.008523573633283377, "learning_rate": 1.59e-07, "loss": 0.0004, "num_tokens": 2802568.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 176.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.007662872783839703, "kl": 0.0014800818171352148, "learning_rate": 1.5866666666666666e-07, "loss": 0.0001, "num_tokens": 2802842.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 176.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.028721291571855545, "kl": 0.01291783805936575, "learning_rate": 1.5833333333333333e-07, "loss": 0.0007, "num_tokens": 2803114.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 176.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04732772707939148, "kl": 0.001399710774421692, "learning_rate": 1.58e-07, "loss": 0.0001, "num_tokens": 2803330.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 176.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.011455900967121124, "kl": 0.0016176364151760936, "learning_rate": 1.5766666666666666e-07, "loss": 0.0001, "num_tokens": 2803658.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.999152183532715, "kl": 0.7840666137635708, "learning_rate": 1.5733333333333334e-07, "loss": -0.1398, "num_tokens": 2803898.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 9529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 176.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.9502185583114624, "kl": 0.3401731550693512, "learning_rate": 1.5700000000000002e-07, "loss": 0.0183, "num_tokens": 2804220.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 176.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07650213688611984, "kl": 0.03174298210069537, "learning_rate": 1.566666666666667e-07, "loss": 0.0015, "num_tokens": 2804527.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 176.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10376128554344177, "kl": 0.00576256331987679, "learning_rate": 1.5633333333333332e-07, "loss": 0.0002, "num_tokens": 2804795.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 176.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09101668000221252, "kl": 0.01548024226212874, "learning_rate": 1.56e-07, "loss": 0.0009, "num_tokens": 2805095.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 176.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.09454462677240372, "kl": 0.039939695969223976, "learning_rate": 1.5566666666666667e-07, "loss": 0.0022, "num_tokens": 2805437.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 176.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.005667231045663357, "kl": 0.1616503745317459, "learning_rate": 1.5533333333333332e-07, "loss": 0.0081, "num_tokens": 2805746.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 176.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 1.5714551210403442, "kl": 0.07886955514550209, "learning_rate": 1.55e-07, "loss": 0.0049, "num_tokens": 2806120.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 176.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.033408455550670624, "kl": 0.008181490702554584, "learning_rate": 1.5466666666666668e-07, "loss": 0.0005, "num_tokens": 2806447.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 176.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03365100175142288, "kl": 0.0007914025336503983, "learning_rate": 1.5433333333333335e-07, "loss": 0.0, "num_tokens": 2806704.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 176.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03636021167039871, "kl": 0.00307451281696558, "learning_rate": 1.54e-07, "loss": 0.0002, "num_tokens": 2807016.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 176.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.005419056862592697, "kl": 0.00020385532843647525, "learning_rate": 1.5366666666666668e-07, "loss": 0.0, "num_tokens": 2807288.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 176.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.12775497138500214, "kl": 0.013672416796907783, "learning_rate": 1.5333333333333336e-07, "loss": 0.0007, "num_tokens": 2807624.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 176.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.029871614649891853, "kl": 0.003635496774222702, "learning_rate": 1.53e-07, "loss": 0.0002, "num_tokens": 2807882.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 176.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.061885356903076, "kl": 0.13871409744024277, "learning_rate": 1.5266666666666666e-07, "loss": 0.0086, "num_tokens": 2808254.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 9543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 176.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.005721048917621374, "kl": 0.0005310453125275671, "learning_rate": 1.5233333333333333e-07, "loss": 0.0, "num_tokens": 2808538.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 2.8619801014428958e-05, "kl": 2.3692846298217773e-06, "learning_rate": 1.52e-07, "loss": 0.0, "num_tokens": 2808758.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07172656804323196, "kl": 0.00579028413631022, "learning_rate": 1.5166666666666666e-07, "loss": 0.0004, "num_tokens": 2808989.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 176.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07688991725444794, "kl": 0.003436783794313669, "learning_rate": 1.5133333333333334e-07, "loss": 0.0002, "num_tokens": 2809253.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 176.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0055934228003025055, "kl": 0.00033292174339294434, "learning_rate": 1.5100000000000002e-07, "loss": 0.0, "num_tokens": 2809461.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 176.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06430573761463165, "kl": 0.0026778578758239746, "learning_rate": 1.5066666666666667e-07, "loss": 0.0001, "num_tokens": 2809721.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9549 }, { "clip_ratio/high_max": 0.008771929889917374, "clip_ratio/high_mean": 0.008771929889917374, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008771929889917374, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 176.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.503368377685547, "kl": 0.4967469163239002, "learning_rate": 1.5033333333333334e-07, "loss": 0.0086, "num_tokens": 2810090.0, "reward": 3.375, "reward_std": 4.8712592124938965, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 4.8712592124938965, "step": 9550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.029600251466035843, "kl": 0.0017618819620111026, "learning_rate": 1.5000000000000002e-07, "loss": 0.0001, "num_tokens": 2810309.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 176.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.008202710188925266, "kl": 0.0003418431442696601, "learning_rate": 1.4966666666666667e-07, "loss": 0.0, "num_tokens": 2810626.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 176.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.031754862517118454, "kl": 0.007476957864128053, "learning_rate": 1.4933333333333332e-07, "loss": 0.0003, "num_tokens": 2810982.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024407673627138138, "kl": 1.6644597053527832e-05, "learning_rate": 1.49e-07, "loss": 0.0, "num_tokens": 2811194.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 176.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03923211246728897, "kl": 0.004935101140290499, "learning_rate": 1.4866666666666667e-07, "loss": 0.0002, "num_tokens": 2811494.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.046997226774692535, "kl": 0.0022775634424760938, "learning_rate": 1.4833333333333332e-07, "loss": 0.0001, "num_tokens": 2811748.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 176.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.456135272979736, "kl": 0.03165629622526467, "learning_rate": 1.48e-07, "loss": 0.0128, "num_tokens": 2812045.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 9557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 177.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.6208432912826538, "kl": 0.09374172985553741, "learning_rate": 1.4766666666666668e-07, "loss": -0.0305, "num_tokens": 2812426.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 177.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.036625511944293976, "kl": 0.003062780946493149, "learning_rate": 1.4733333333333333e-07, "loss": 0.0002, "num_tokens": 2812738.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 177.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.13599556684494019, "kl": 0.0067896172404289246, "learning_rate": 1.47e-07, "loss": 0.0003, "num_tokens": 2812982.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 177.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02631358802318573, "kl": 0.0012416014797054231, "learning_rate": 1.4666666666666668e-07, "loss": 0.0001, "num_tokens": 2813272.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 177.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.12596699595451355, "kl": 0.06198902800679207, "learning_rate": 1.4633333333333336e-07, "loss": 0.0031, "num_tokens": 2813642.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 177.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.019427374005317688, "kl": 0.0008019626038731076, "learning_rate": 1.4599999999999998e-07, "loss": 0.0, "num_tokens": 2813898.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 177.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04771993309259415, "kl": 0.026112916879355907, "learning_rate": 1.4566666666666666e-07, "loss": 0.0014, "num_tokens": 2814186.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 177.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.23239776492118835, "kl": 0.041260702069848776, "learning_rate": 1.4533333333333334e-07, "loss": 0.002, "num_tokens": 2814482.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 177.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06313328444957733, "kl": 0.004499338800087571, "learning_rate": 1.45e-07, "loss": 0.0002, "num_tokens": 2814746.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 177.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0063460092060267925, "kl": 0.16387245059013367, "learning_rate": 1.4466666666666667e-07, "loss": 0.0082, "num_tokens": 2815054.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 177.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07178795337677002, "kl": 0.005559865618124604, "learning_rate": 1.4433333333333334e-07, "loss": 0.0003, "num_tokens": 2815331.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 177.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0264766663312912, "kl": 0.0019895622390322387, "learning_rate": 1.4400000000000002e-07, "loss": 0.0001, "num_tokens": 2815631.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 177.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.019687240943312645, "kl": 0.09615558013319969, "learning_rate": 1.4366666666666667e-07, "loss": 0.0048, "num_tokens": 2816003.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 177.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.003190296469256282, "kl": 0.0001267552434001118, "learning_rate": 1.4333333333333335e-07, "loss": 0.0, "num_tokens": 2816223.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 177.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.06155973672866821, "kl": 0.05993828922510147, "learning_rate": 1.4300000000000002e-07, "loss": 0.003, "num_tokens": 2816591.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 177.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.005751395132392645, "kl": 0.0003493606927804649, "learning_rate": 1.4266666666666667e-07, "loss": 0.0, "num_tokens": 2816851.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 177.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.057162024080753326, "kl": 0.011976105161011219, "learning_rate": 1.4233333333333332e-07, "loss": 0.0006, "num_tokens": 2817194.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 177.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 7.441741036018357e-05, "kl": 2.294778823852539e-06, "learning_rate": 1.42e-07, "loss": 0.0, "num_tokens": 2817414.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 177.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.027738021686673164, "kl": 0.026988955214619637, "learning_rate": 1.4166666666666668e-07, "loss": 0.0014, "num_tokens": 2817716.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 177.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.011388730257749557, "kl": 0.011602215701714158, "learning_rate": 1.4133333333333333e-07, "loss": 0.0007, "num_tokens": 2817990.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 177.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017021670937538147, "kl": 0.0005283690989017487, "learning_rate": 1.41e-07, "loss": 0.0, "num_tokens": 2818250.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 177.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.08981374651193619, "kl": 0.012530907988548279, "learning_rate": 1.4066666666666668e-07, "loss": 0.0007, "num_tokens": 2818536.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 177.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 4.168918609619141, "kl": 0.15909979492425919, "learning_rate": 1.4033333333333333e-07, "loss": 0.2073, "num_tokens": 2818828.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 9580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 177.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 12.229592323303223, "kl": 1.2701507389429025, "learning_rate": 1.4e-07, "loss": 0.0602, "num_tokens": 2819087.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 177.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.011176024563610554, "kl": 0.0072957759257406, "learning_rate": 1.3966666666666669e-07, "loss": 0.0004, "num_tokens": 2819359.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 177.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.030984047800302505, "kl": 0.005335319088771939, "learning_rate": 1.3933333333333334e-07, "loss": 0.0003, "num_tokens": 2819654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 177.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09007994830608368, "kl": 0.0016735196113586426, "learning_rate": 1.39e-07, "loss": 0.0001, "num_tokens": 2819874.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 177.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07154640555381775, "kl": 0.004637794831069186, "learning_rate": 1.3866666666666666e-07, "loss": 0.0002, "num_tokens": 2820148.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 177.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.20592337846755981, "kl": 0.01621579035418108, "learning_rate": 1.3833333333333334e-07, "loss": 0.0009, "num_tokens": 2820434.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 177.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.017338179051876068, "kl": 0.012760586105287075, "learning_rate": 1.38e-07, "loss": 0.0006, "num_tokens": 2820694.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 177.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.019098149612545967, "kl": 0.0020502295228652656, "learning_rate": 1.3766666666666667e-07, "loss": 0.0001, "num_tokens": 2820964.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 177.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 0.17156332731246948, "kl": 0.37659671571600484, "learning_rate": 1.3733333333333335e-07, "loss": 0.0039, "num_tokens": 2821275.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 177.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 2.4867427349090576, "kl": 0.06598281487822533, "learning_rate": 1.37e-07, "loss": -0.0335, "num_tokens": 2821675.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 9590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 177.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.017378803342580795, "kl": 0.0003652125597000122, "learning_rate": 1.3666666666666667e-07, "loss": 0.0, "num_tokens": 2821881.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 177.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.11058441549539566, "kl": 0.01982554141432047, "learning_rate": 1.3633333333333335e-07, "loss": 0.0012, "num_tokens": 2822160.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 177.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02188039757311344, "kl": 0.010243285563774407, "learning_rate": 1.36e-07, "loss": 0.0004, "num_tokens": 2822483.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 177.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.07262875884771347, "kl": 0.01694716513156891, "learning_rate": 1.3566666666666665e-07, "loss": 0.0008, "num_tokens": 2822809.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 177.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01224533375352621, "kl": 0.0004906567046418786, "learning_rate": 1.3533333333333333e-07, "loss": 0.0, "num_tokens": 2823132.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 177.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.002286963164806366, "kl": 2.419203519821167e-05, "learning_rate": 1.35e-07, "loss": 0.0, "num_tokens": 2823344.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 177.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.004637403879314661, "kl": 0.0001556376664666459, "learning_rate": 1.3466666666666665e-07, "loss": 0.0, "num_tokens": 2823616.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 177.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 6.643313407897949, "kl": 0.02667102124541998, "learning_rate": 1.3433333333333333e-07, "loss": 0.0297, "num_tokens": 2823951.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 177.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.025852812454104424, "kl": 0.002269966993480921, "learning_rate": 1.34e-07, "loss": 0.0001, "num_tokens": 2824184.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 177.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.1519833207130432, "kl": 0.03485689498484135, "learning_rate": 1.3366666666666669e-07, "loss": 0.0018, "num_tokens": 2824522.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 177.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003160873893648386, "kl": 0.0012445024913176894, "learning_rate": 1.3333333333333334e-07, "loss": 0.0001, "num_tokens": 2824802.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 177.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04331769049167633, "kl": 0.0060004518600180745, "learning_rate": 1.33e-07, "loss": 0.0003, "num_tokens": 2825122.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 177.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.029098965227603912, "kl": 0.0016439953760709614, "learning_rate": 1.326666666666667e-07, "loss": 0.0001, "num_tokens": 2825357.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 177.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.8213188648223877, "kl": 0.03336055390536785, "learning_rate": 1.3233333333333331e-07, "loss": 0.0282, "num_tokens": 2825684.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 177.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.9011693000793457, "kl": 0.08182980120182037, "learning_rate": 1.32e-07, "loss": -0.0125, "num_tokens": 2825987.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 177.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.09659013897180557, "kl": 0.017395183676853776, "learning_rate": 1.3166666666666667e-07, "loss": 0.0009, "num_tokens": 2826321.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 177.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.064116932451725, "kl": 0.048350848257541656, "learning_rate": 1.3133333333333334e-07, "loss": 0.0024, "num_tokens": 2826659.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 177.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06287820637226105, "kl": 0.005229388130828738, "learning_rate": 1.31e-07, "loss": 0.0003, "num_tokens": 2826957.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 177.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.030907947570085526, "kl": 0.0037443265318870544, "learning_rate": 1.3066666666666667e-07, "loss": 0.0002, "num_tokens": 2827248.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 177.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02483222633600235, "kl": 0.018993492238223553, "learning_rate": 1.3033333333333335e-07, "loss": 0.001, "num_tokens": 2827609.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 177.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0145722646266222, "kl": 0.2659342437982559, "learning_rate": 1.3e-07, "loss": 0.0133, "num_tokens": 2827913.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019047962268814445, "kl": 0.0035469159483909607, "learning_rate": 1.2966666666666668e-07, "loss": 0.0002, "num_tokens": 2828149.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 178.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03985825553536415, "kl": 0.007996839005500078, "learning_rate": 1.2933333333333335e-07, "loss": 0.0004, "num_tokens": 2828438.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 178.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06788503378629684, "kl": 0.06598960235714912, "learning_rate": 1.29e-07, "loss": 0.0033, "num_tokens": 2828819.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 178.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396779716014862, "kl": 0.018114380538463593, "learning_rate": 1.2866666666666665e-07, "loss": 0.001, "num_tokens": 2829109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 178.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.19356422126293182, "kl": 0.02031507482752204, "learning_rate": 1.2833333333333333e-07, "loss": 0.0011, "num_tokens": 2829429.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 178.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.058159466832876205, "kl": 0.0021013430086895823, "learning_rate": 1.28e-07, "loss": 0.0001, "num_tokens": 2829723.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 178.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.027771631255745888, "kl": 0.03797489311546087, "learning_rate": 1.2766666666666666e-07, "loss": 0.0019, "num_tokens": 2830128.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 178.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.010564511641860008, "kl": 0.000665765255689621, "learning_rate": 1.2733333333333334e-07, "loss": 0.0, "num_tokens": 2830388.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 178.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.028403762727975845, "kl": 0.004304815316572785, "learning_rate": 1.27e-07, "loss": 0.0002, "num_tokens": 2830715.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008022031397558749, "kl": 0.0013871045666746795, "learning_rate": 1.2666666666666666e-07, "loss": 0.0001, "num_tokens": 2830989.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 178.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.2146918773651123, "kl": 0.08477691747248173, "learning_rate": 1.2633333333333334e-07, "loss": 0.0643, "num_tokens": 2831322.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 178.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.028722399845719337, "kl": 0.004485728684812784, "learning_rate": 1.2600000000000002e-07, "loss": 0.0002, "num_tokens": 2831620.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 178.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.14467935264110565, "kl": 0.16932255029678345, "learning_rate": 1.2566666666666667e-07, "loss": 0.0085, "num_tokens": 2831936.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.033596307039260864, "kl": 0.0013881406630389392, "learning_rate": 1.2533333333333332e-07, "loss": 0.0001, "num_tokens": 2832208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 178.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.014502130448818207, "kl": 0.2659769505262375, "learning_rate": 1.25e-07, "loss": 0.0133, "num_tokens": 2832512.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 178.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.6548380851745605, "kl": 0.02610295871272683, "learning_rate": 1.2466666666666667e-07, "loss": -0.0137, "num_tokens": 2832817.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 178.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.011660748161375523, "kl": 0.00016899704860406928, "learning_rate": 1.2433333333333332e-07, "loss": 0.0, "num_tokens": 2833073.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 178.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06711766123771667, "kl": 0.003436968778260052, "learning_rate": 1.24e-07, "loss": 0.0002, "num_tokens": 2833340.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 178.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02088790573179722, "kl": 0.0003516205833875574, "learning_rate": 1.2366666666666668e-07, "loss": 0.0, "num_tokens": 2833562.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 178.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.5831379890441895, "kl": 0.18136628530919552, "learning_rate": 1.2333333333333335e-07, "loss": -0.0158, "num_tokens": 2833920.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 178.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.022614121437072754, "kl": 0.001610791718121618, "learning_rate": 1.23e-07, "loss": 0.0001, "num_tokens": 2834180.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 178.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02911880612373352, "kl": 0.00682230263191741, "learning_rate": 1.2266666666666668e-07, "loss": 0.0003, "num_tokens": 2834452.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.7033159136772156, "kl": 0.0785313555970788, "learning_rate": 1.2233333333333336e-07, "loss": 0.0067, "num_tokens": 2834717.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 178.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 8.678153991699219, "kl": 0.010550772189162672, "learning_rate": 1.2199999999999998e-07, "loss": 0.2468, "num_tokens": 2834937.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 178.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.024378197267651558, "kl": 0.005471828859299421, "learning_rate": 1.2166666666666666e-07, "loss": 0.0003, "num_tokens": 2835278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.017940865829586983, "kl": 0.0008927244052756578, "learning_rate": 1.2133333333333333e-07, "loss": 0.0, "num_tokens": 2835560.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 178.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.025685062631964684, "kl": 0.0010303754970664158, "learning_rate": 1.21e-07, "loss": 0.0001, "num_tokens": 2835872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 178.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.12392605096101761, "kl": 0.03731744363903999, "learning_rate": 1.2066666666666666e-07, "loss": 0.0019, "num_tokens": 2836174.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 7.473946880054427e-06, "kl": 1.9222497940063477e-06, "learning_rate": 1.2033333333333334e-07, "loss": 0.0, "num_tokens": 2836394.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 178.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.012653307989239693, "kl": 0.0018477363046258688, "learning_rate": 1.2000000000000002e-07, "loss": 0.0001, "num_tokens": 2836708.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03158792480826378, "kl": 0.019087360240519047, "learning_rate": 1.1966666666666667e-07, "loss": 0.0011, "num_tokens": 2836988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 178.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05164724215865135, "kl": 0.0016252377245109528, "learning_rate": 1.1933333333333334e-07, "loss": 0.0001, "num_tokens": 2837222.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9643 }, { "clip_ratio/high_max": 0.01515151560306549, "clip_ratio/high_mean": 0.01515151560306549, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01515151560306549, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 178.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 12.528735160827637, "kl": 0.027745794504880905, "learning_rate": 1.1900000000000001e-07, "loss": 0.1967, "num_tokens": 2837502.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 178.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.5991106033325195, "kl": 0.1108491700142622, "learning_rate": 1.1866666666666666e-07, "loss": 0.2138, "num_tokens": 2837811.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 178.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.018112411722540855, "kl": 0.012603074312210083, "learning_rate": 1.1833333333333333e-07, "loss": 0.0006, "num_tokens": 2838071.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 178.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.012119478546082973, "kl": 0.001166579604614526, "learning_rate": 1.18e-07, "loss": 0.0001, "num_tokens": 2838345.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 178.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03568000718951225, "kl": 0.009528001770377159, "learning_rate": 1.1766666666666666e-07, "loss": 0.0005, "num_tokens": 2838694.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 178.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.003912889398634434, "kl": 0.00022037699818611145, "learning_rate": 1.1733333333333334e-07, "loss": 0.0, "num_tokens": 2838938.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 178.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06269144266843796, "kl": 0.020347768906503916, "learning_rate": 1.17e-07, "loss": 0.001, "num_tokens": 2839210.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 178.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03186080977320671, "kl": 0.00934227230027318, "learning_rate": 1.1666666666666667e-07, "loss": 0.0005, "num_tokens": 2839529.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 178.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03496990725398064, "kl": 0.017681284807622433, "learning_rate": 1.1633333333333334e-07, "loss": 0.0008, "num_tokens": 2839856.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 178.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.15210020542144775, "kl": 0.019458720460534096, "learning_rate": 1.16e-07, "loss": 0.001, "num_tokens": 2840140.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 178.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.052592452615499496, "kl": 0.05388986878097057, "learning_rate": 1.1566666666666668e-07, "loss": 0.0027, "num_tokens": 2840485.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 178.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10374416410923004, "kl": 0.030501834116876125, "learning_rate": 1.1533333333333335e-07, "loss": 0.0016, "num_tokens": 2840797.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.022376513108611107, "kl": 0.0018261033692397177, "learning_rate": 1.15e-07, "loss": 0.0001, "num_tokens": 2841093.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 178.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.010112726129591465, "kl": 0.00042986913467757404, "learning_rate": 1.1466666666666666e-07, "loss": 0.0, "num_tokens": 2841411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.001916148466989398, "kl": 1.3880431652069092e-05, "learning_rate": 1.1433333333333332e-07, "loss": 0.0, "num_tokens": 2841623.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 178.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.006173481699079275, "kl": 0.00018824636936187744, "learning_rate": 1.14e-07, "loss": 0.0, "num_tokens": 2841831.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.012239635922014713, "kl": 0.000723128963727504, "learning_rate": 1.1366666666666667e-07, "loss": 0.0, "num_tokens": 2842099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018316024215891957, "kl": 0.0035710036754608154, "learning_rate": 1.1333333333333334e-07, "loss": 0.0002, "num_tokens": 2842335.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 178.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027970748487859964, "kl": 0.0003119918255833909, "learning_rate": 1.13e-07, "loss": 0.0, "num_tokens": 2842597.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 178.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.01072064321488142, "kl": 0.007303948746994138, "learning_rate": 1.1266666666666667e-07, "loss": 0.0004, "num_tokens": 2842869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 178.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.023765217512845993, "kl": 0.07397115416824818, "learning_rate": 1.1233333333333335e-07, "loss": 0.0037, "num_tokens": 2843239.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 178.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06605005264282227, "kl": 0.020146341994404793, "learning_rate": 1.1200000000000001e-07, "loss": 0.001, "num_tokens": 2843541.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 179.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.9219770431518555, "kl": 0.07599946111440659, "learning_rate": 1.1166666666666666e-07, "loss": 0.0102, "num_tokens": 2843909.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 179.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018040143186226487, "kl": 0.0035718977451324463, "learning_rate": 1.1133333333333332e-07, "loss": 0.0002, "num_tokens": 2844145.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 179.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.014750920236110687, "kl": 0.0017863232642412186, "learning_rate": 1.11e-07, "loss": 0.0001, "num_tokens": 2844457.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.11244037002325058, "kl": 0.006736365205142647, "learning_rate": 1.1066666666666667e-07, "loss": 0.0003, "num_tokens": 2844753.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 179.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.001369985518977046, "kl": 3.6619603633880615e-05, "learning_rate": 1.1033333333333333e-07, "loss": 0.0, "num_tokens": 2844965.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 179.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.08508226275444031, "kl": 0.02371902298182249, "learning_rate": 1.1e-07, "loss": 0.0012, "num_tokens": 2845238.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 179.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.057099487632513046, "kl": 0.007750753313302994, "learning_rate": 1.0966666666666667e-07, "loss": 0.0004, "num_tokens": 2845531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 179.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.30677762627601624, "kl": 0.031641120091080666, "learning_rate": 1.0933333333333335e-07, "loss": 0.0019, "num_tokens": 2845808.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 179.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927388146519661, "kl": 0.004070740658789873, "learning_rate": 1.0900000000000001e-07, "loss": 0.0002, "num_tokens": 2846064.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 179.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.002588885836303234, "kl": 0.0004719384014606476, "learning_rate": 1.0866666666666667e-07, "loss": 0.0, "num_tokens": 2846324.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 179.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02736266329884529, "kl": 0.0034303624415770173, "learning_rate": 1.0833333333333332e-07, "loss": 0.0002, "num_tokens": 2846663.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.112271308898926, "kl": 0.04132060831761919, "learning_rate": 1.08e-07, "loss": 0.0487, "num_tokens": 2846948.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 9677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 179.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.017123788595199585, "kl": 0.012875073589384556, "learning_rate": 1.0766666666666666e-07, "loss": 0.0006, "num_tokens": 2847208.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 179.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.005060833413153887, "kl": 0.0001884127632365562, "learning_rate": 1.0733333333333333e-07, "loss": 0.0, "num_tokens": 2847480.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 179.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.10962368547916412, "kl": 0.018941693706437945, "learning_rate": 1.07e-07, "loss": 0.001, "num_tokens": 2847804.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 179.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03470403328537941, "kl": 0.0005734115839004517, "learning_rate": 1.0666666666666667e-07, "loss": 0.0, "num_tokens": 2848048.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 179.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08530457317829132, "kl": 0.034948455169796944, "learning_rate": 1.0633333333333333e-07, "loss": 0.0017, "num_tokens": 2848383.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02458912320435047, "kl": 0.0122128298971802, "learning_rate": 1.0600000000000001e-07, "loss": 0.0007, "num_tokens": 2848657.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 179.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.04479043185710907, "kl": 0.035796113312244415, "learning_rate": 1.0566666666666667e-07, "loss": 0.0018, "num_tokens": 2848959.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 179.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07037424296140671, "kl": 0.006918626604601741, "learning_rate": 1.0533333333333335e-07, "loss": 0.0003, "num_tokens": 2849221.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 179.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.021594589576125145, "kl": 0.0006519470916828141, "learning_rate": 1.0500000000000001e-07, "loss": 0.0, "num_tokens": 2849456.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 179.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.08782421797513962, "kl": 0.01188184879720211, "learning_rate": 1.0466666666666666e-07, "loss": 0.0005, "num_tokens": 2849728.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 179.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.015624243766069412, "kl": 0.0013450205442495644, "learning_rate": 1.0433333333333333e-07, "loss": 0.0001, "num_tokens": 2849988.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007462686393409967, "clip_ratio/low_min": 0.007462686393409967, "clip_ratio/region_mean": 0.007462686393409967, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 179.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.651930809020996, "kl": 0.027714114636182785, "learning_rate": 1.0399999999999999e-07, "loss": 0.1709, "num_tokens": 2850335.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 179.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03454925864934921, "kl": 0.025273829407524318, "learning_rate": 1.0366666666666667e-07, "loss": 0.0013, "num_tokens": 2850624.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 179.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.3167155385017395, "kl": 0.0384307811036706, "learning_rate": 1.0333333333333333e-07, "loss": 0.002, "num_tokens": 2850927.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 179.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.102170944213867, "kl": 0.1525644026696682, "learning_rate": 1.0300000000000001e-07, "loss": 0.0182, "num_tokens": 2851301.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 179.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02888142503798008, "kl": 0.0022603245452046394, "learning_rate": 1.0266666666666667e-07, "loss": 0.0001, "num_tokens": 2851601.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 179.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.014978638850152493, "kl": 0.26587051153182983, "learning_rate": 1.0233333333333334e-07, "loss": 0.0133, "num_tokens": 2851905.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00048795342445373535, "kl": 0.0013157953508198261, "learning_rate": 1.0200000000000001e-07, "loss": 0.0001, "num_tokens": 2852182.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 179.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.026828717440366745, "kl": 0.04668557830154896, "learning_rate": 1.0166666666666668e-07, "loss": 0.0023, "num_tokens": 2852515.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 179.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.440510272979736, "kl": 0.42124919034540653, "learning_rate": 1.0133333333333333e-07, "loss": 0.2074, "num_tokens": 2852813.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 9697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 179.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.08777348697185516, "kl": 0.0058256154879927635, "learning_rate": 1.0099999999999999e-07, "loss": 0.0004, "num_tokens": 2853028.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 179.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.23188771307468414, "kl": 0.07376392185688019, "learning_rate": 1.0066666666666667e-07, "loss": 0.0034, "num_tokens": 2853336.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 179.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.1250053197145462, "kl": 0.008664275519549847, "learning_rate": 1.0033333333333333e-07, "loss": 0.0004, "num_tokens": 2853662.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083100438117981, "kl": 0.004676389042288065, "learning_rate": 1e-07, "loss": 0.0002, "num_tokens": 2853923.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 179.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.007782944943755865, "kl": 0.16182966530323029, "learning_rate": 9.966666666666667e-08, "loss": 0.0081, "num_tokens": 2854232.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 179.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.003936781547963619, "kl": 0.0013590991293312982, "learning_rate": 9.933333333333334e-08, "loss": 0.0001, "num_tokens": 2854451.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 179.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0384286493062973, "kl": 0.005321982316672802, "learning_rate": 9.900000000000001e-08, "loss": 0.0003, "num_tokens": 2854788.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 179.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.4603114426136017, "kl": 0.0931578278541565, "learning_rate": 9.866666666666668e-08, "loss": 0.0047, "num_tokens": 2855167.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 179.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.10489687323570251, "kl": 0.0028638184594456106, "learning_rate": 9.833333333333334e-08, "loss": 0.0002, "num_tokens": 2855494.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 179.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.2989505529403687, "kl": 0.13962608575820923, "learning_rate": 9.799999999999999e-08, "loss": 0.0296, "num_tokens": 2855902.0, "reward": 2.875, "reward_std": 0.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 0.25, "step": 9707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 179.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.047474656254053116, "kl": 0.006973778363317251, "learning_rate": 9.766666666666665e-08, "loss": 0.0003, "num_tokens": 2856208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 179.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.002819453366100788, "kl": 0.0001382927075610496, "learning_rate": 9.733333333333333e-08, "loss": 0.0, "num_tokens": 2856520.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 179.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.022756028920412064, "kl": 0.0007744921022094786, "learning_rate": 9.7e-08, "loss": 0.0, "num_tokens": 2856840.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 179.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03172854706645012, "kl": 0.005972646409645677, "learning_rate": 9.666666666666667e-08, "loss": 0.0003, "num_tokens": 2857128.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 179.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.7800384163856506, "kl": 0.14649273082613945, "learning_rate": 9.633333333333334e-08, "loss": 0.0069, "num_tokens": 2857429.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 179.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.004654505755752325, "kl": 0.000758931040763855, "learning_rate": 9.6e-08, "loss": 0.0, "num_tokens": 2857645.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.047067224979400635, "kl": 0.003557793330401182, "learning_rate": 9.566666666666668e-08, "loss": 0.0002, "num_tokens": 2857918.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 179.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 3.60748017556034e-05, "kl": 2.1085143089294434e-06, "learning_rate": 9.533333333333334e-08, "loss": 0.0, "num_tokens": 2858138.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 179.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05438502877950668, "kl": 0.0047757600113982335, "learning_rate": 9.500000000000002e-08, "loss": 0.0002, "num_tokens": 2858398.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 179.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02562764845788479, "kl": 0.09701839089393616, "learning_rate": 9.466666666666668e-08, "loss": 0.0049, "num_tokens": 2858770.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 179.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.031806401908397675, "kl": 0.019666369073092937, "learning_rate": 9.433333333333333e-08, "loss": 0.001, "num_tokens": 2859143.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 179.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.11035704612731934, "kl": 0.011846620822325349, "learning_rate": 9.4e-08, "loss": 0.0007, "num_tokens": 2859430.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 180.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.05384654179215431, "kl": 0.0030358732328750193, "learning_rate": 9.366666666666666e-08, "loss": 0.0001, "num_tokens": 2859704.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 180.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.015693986788392067, "kl": 0.0007912683067843318, "learning_rate": 9.333333333333334e-08, "loss": 0.0, "num_tokens": 2860033.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 180.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.8564913272857666, "kl": 0.13217679783701897, "learning_rate": 9.3e-08, "loss": 0.0179, "num_tokens": 2860378.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 9722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 180.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.014074375852942467, "kl": 0.1599755361676216, "learning_rate": 9.266666666666668e-08, "loss": 0.008, "num_tokens": 2860688.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 180.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 6.516552448272705, "kl": 0.11829948239028454, "learning_rate": 9.233333333333334e-08, "loss": 0.1751, "num_tokens": 2860980.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 180.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 1.924603819847107, "kl": 0.1877682562917471, "learning_rate": 9.2e-08, "loss": 0.0097, "num_tokens": 2861351.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 180.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.030710916966199875, "kl": 0.0040566254465375096, "learning_rate": 9.166666666666667e-08, "loss": 0.0002, "num_tokens": 2861611.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009462040616199374, "kl": 4.547089338302612e-05, "learning_rate": 9.133333333333333e-08, "loss": 0.0, "num_tokens": 2861823.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 7.332174301147461, "kl": 0.1630617007613182, "learning_rate": 9.100000000000001e-08, "loss": 0.1538, "num_tokens": 2862079.0, "reward": 2.0, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 1.7320507764816284, "step": 9728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 180.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03276969864964485, "kl": 0.009042461810167879, "learning_rate": 9.066666666666667e-08, "loss": 0.0005, "num_tokens": 2862401.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 180.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.017010599374771118, "kl": 0.012413645163178444, "learning_rate": 9.033333333333333e-08, "loss": 0.0006, "num_tokens": 2862715.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 180.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.15554964542388916, "kl": 0.021025316091254354, "learning_rate": 9e-08, "loss": 0.0013, "num_tokens": 2862980.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 180.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01840536668896675, "kl": 0.0016925626550801098, "learning_rate": 8.966666666666666e-08, "loss": 0.0001, "num_tokens": 2863252.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017756319139152765, "kl": 0.0035761669278144836, "learning_rate": 8.933333333333334e-08, "loss": 0.0002, "num_tokens": 2863488.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 180.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.026851674541831017, "kl": 0.0009266337146982551, "learning_rate": 8.9e-08, "loss": 0.0001, "num_tokens": 2863704.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 180.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.642868518829346, "kl": 0.00667224545031786, "learning_rate": 8.866666666666668e-08, "loss": 0.0905, "num_tokens": 2863984.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 180.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02941243164241314, "kl": 0.01439197943545878, "learning_rate": 8.833333333333333e-08, "loss": 0.0008, "num_tokens": 2864276.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 180.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.903420448303223, "kl": 0.33060973044484854, "learning_rate": 8.800000000000001e-08, "loss": 0.1991, "num_tokens": 2864582.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 180.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 7.891350746154785, "kl": 0.04278185838484205, "learning_rate": 8.766666666666667e-08, "loss": 0.1731, "num_tokens": 2864821.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 180.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06596143543720245, "kl": 0.028216956183314323, "learning_rate": 8.733333333333333e-08, "loss": 0.0014, "num_tokens": 2865143.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.004152251407504082, "kl": 0.0014495551586151123, "learning_rate": 8.700000000000001e-08, "loss": 0.0001, "num_tokens": 2865359.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 180.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.743297100067139, "kl": 0.019787953235208988, "learning_rate": 8.666666666666666e-08, "loss": 0.1181, "num_tokens": 2865668.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 180.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0577881820499897, "kl": 0.003214998869225383, "learning_rate": 8.633333333333334e-08, "loss": 0.0002, "num_tokens": 2865966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 180.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.420711517333984, "kl": 0.024049285799264908, "learning_rate": 8.6e-08, "loss": 0.0858, "num_tokens": 2866245.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 9743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 180.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03405766934156418, "kl": 0.00945271854288876, "learning_rate": 8.566666666666667e-08, "loss": 0.0004, "num_tokens": 2866531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 180.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04872254282236099, "kl": 0.019486029166728258, "learning_rate": 8.533333333333334e-08, "loss": 0.001, "num_tokens": 2866831.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 180.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00877660047262907, "kl": 0.0005549965426325798, "learning_rate": 8.5e-08, "loss": 0.0, "num_tokens": 2867113.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 89.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 89.25, "completions/mean_terminated_length": 33.66666793823242, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 180.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.464437484741211, "kl": 0.039972422644495964, "learning_rate": 8.466666666666667e-08, "loss": 0.3621, "num_tokens": 2867694.0, "reward": 2.799999952316284, "reward_std": 5.770037651062012, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 5.770037651062012, "step": 9747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 180.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04211128503084183, "kl": 0.010945403948426247, "learning_rate": 8.433333333333333e-08, "loss": 0.0005, "num_tokens": 2867985.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 180.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04447166994214058, "kl": 0.004647494293749332, "learning_rate": 8.4e-08, "loss": 0.0002, "num_tokens": 2868275.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 180.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02404971234500408, "kl": 0.09680220484733582, "learning_rate": 8.366666666666667e-08, "loss": 0.0048, "num_tokens": 2868647.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 180.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11517985910177231, "kl": 0.0034321162675041705, "learning_rate": 8.333333333333333e-08, "loss": 0.0002, "num_tokens": 2868973.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 180.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.05838199704885483, "kl": 0.0018817521631717682, "learning_rate": 8.3e-08, "loss": 0.0001, "num_tokens": 2869217.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 180.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.005811711773276329, "kl": 0.00024369855236727744, "learning_rate": 8.266666666666667e-08, "loss": 0.0, "num_tokens": 2869489.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 180.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.01925094798207283, "kl": 0.002049759670626372, "learning_rate": 8.233333333333334e-08, "loss": 0.0001, "num_tokens": 2869759.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 180.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.0649664402008057, "kl": 0.8399522602558136, "learning_rate": 8.2e-08, "loss": 0.0682, "num_tokens": 2870065.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 9755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.013387484475970268, "kl": 0.0025460347533226013, "learning_rate": 8.166666666666667e-08, "loss": 0.0001, "num_tokens": 2870283.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 180.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.028486497700214386, "kl": 0.004458521492779255, "learning_rate": 8.133333333333333e-08, "loss": 0.0002, "num_tokens": 2870613.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 180.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.02511739730835, "kl": 0.030264260014519095, "learning_rate": 8.1e-08, "loss": 0.0892, "num_tokens": 2870951.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 180.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.6431829929351807, "kl": 0.38218337297439575, "learning_rate": 8.066666666666667e-08, "loss": 0.061, "num_tokens": 2871320.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 9759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 180.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.02280070260167122, "kl": 0.004805322969332337, "learning_rate": 8.033333333333334e-08, "loss": 0.0002, "num_tokens": 2871602.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 180.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015851635253056884, "kl": 0.00014714333519805223, "learning_rate": 8e-08, "loss": 0.0, "num_tokens": 2871916.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 180.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06560209393501282, "kl": 0.010839097434654832, "learning_rate": 7.966666666666667e-08, "loss": 0.0005, "num_tokens": 2872216.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 180.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.011354167014360428, "kl": 0.0001378953456878662, "learning_rate": 7.933333333333333e-08, "loss": 0.0, "num_tokens": 2872472.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 180.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07772146910429001, "kl": 0.04510589502751827, "learning_rate": 7.9e-08, "loss": 0.0021, "num_tokens": 2872887.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 180.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.04635665565729141, "kl": 0.03606859967112541, "learning_rate": 7.866666666666667e-08, "loss": 0.0018, "num_tokens": 2873187.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 180.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 8.049996376037598, "kl": 0.7344900369644165, "learning_rate": 7.833333333333335e-08, "loss": -0.0052, "num_tokens": 2873529.0, "reward": 4.125, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.308422088623047, "step": 9766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 180.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.048285603523254395, "kl": 0.006073974538594484, "learning_rate": 7.8e-08, "loss": 0.0003, "num_tokens": 2873797.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 180.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.042340707033872604, "kl": 0.0013536736369132996, "learning_rate": 7.766666666666666e-08, "loss": 0.0001, "num_tokens": 2874057.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 7.848901987017598e-06, "kl": 1.996755599975586e-06, "learning_rate": 7.733333333333334e-08, "loss": 0.0, "num_tokens": 2874277.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 180.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03038335032761097, "kl": 0.007244020933285356, "learning_rate": 7.7e-08, "loss": 0.0003, "num_tokens": 2874630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 180.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.020245898514986038, "kl": 0.00036829710006713867, "learning_rate": 7.666666666666668e-08, "loss": 0.0, "num_tokens": 2874834.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 180.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01662726141512394, "kl": 0.0015327015426009893, "learning_rate": 7.633333333333333e-08, "loss": 0.0001, "num_tokens": 2875116.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 180.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1702125370502472, "kl": 0.01502043369691819, "learning_rate": 7.6e-08, "loss": 0.0007, "num_tokens": 2875384.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 181.0, "frac_reward_zero_std": 1.0, "grad_norm": 1.9899954795837402, "kl": 0.3369261724874377, "learning_rate": 7.566666666666667e-08, "loss": 0.0177, "num_tokens": 2875645.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 181.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.43036413192749, "kl": 0.10908255726099014, "learning_rate": 7.533333333333333e-08, "loss": 0.0582, "num_tokens": 2875891.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 181.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03449322283267975, "kl": 0.0026401603827252984, "learning_rate": 7.500000000000001e-08, "loss": 0.0001, "num_tokens": 2876192.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019202911062166095, "kl": 0.0035481080412864685, "learning_rate": 7.466666666666666e-08, "loss": 0.0002, "num_tokens": 2876428.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 181.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 7.282996654510498, "kl": 0.028393109212629497, "learning_rate": 7.433333333333334e-08, "loss": 0.1089, "num_tokens": 2876721.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 9778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 181.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.08331066370010376, "kl": 0.01894476218149066, "learning_rate": 7.4e-08, "loss": 0.0011, "num_tokens": 2877005.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 181.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.7062828540802, "kl": 0.04134655185043812, "learning_rate": 7.366666666666666e-08, "loss": 0.0718, "num_tokens": 2877329.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 181.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.049431804567575455, "kl": 0.032853782176971436, "learning_rate": 7.333333333333334e-08, "loss": 0.0016, "num_tokens": 2877677.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 181.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.035436488687992096, "kl": 0.004036555823404342, "learning_rate": 7.299999999999999e-08, "loss": 0.0002, "num_tokens": 2877935.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 181.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.2851065397262573, "kl": 0.024580217897892, "learning_rate": 7.266666666666667e-08, "loss": 0.0015, "num_tokens": 2878197.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.027735114097595215, "kl": 0.0002457946538925171, "learning_rate": 7.233333333333333e-08, "loss": 0.0, "num_tokens": 2878409.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 181.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.029226010665297508, "kl": 0.0027967566275037825, "learning_rate": 7.200000000000001e-08, "loss": 0.0001, "num_tokens": 2878675.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 181.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.005747013725340366, "kl": 0.16158832609653473, "learning_rate": 7.166666666666667e-08, "loss": 0.0081, "num_tokens": 2878984.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 181.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.3371148109436035, "kl": 0.3379513509571552, "learning_rate": 7.133333333333334e-08, "loss": 0.0063, "num_tokens": 2879324.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 4.05634636990726e-05, "kl": 1.7583370208740234e-06, "learning_rate": 7.1e-08, "loss": 0.0, "num_tokens": 2879544.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.011363636702299118, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 181.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.3449803590774536, "kl": 0.3724980056285858, "learning_rate": 7.066666666666666e-08, "loss": 0.1351, "num_tokens": 2879860.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 9789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1281738132238388, "kl": 0.006687208544462919, "learning_rate": 7.033333333333334e-08, "loss": 0.0004, "num_tokens": 2880087.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 181.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003145444788970053, "kl": 0.0012390486081130803, "learning_rate": 7e-08, "loss": 0.0001, "num_tokens": 2880367.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 181.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.9833400249481201, "kl": 0.25872868299484253, "learning_rate": 6.966666666666667e-08, "loss": 0.0064, "num_tokens": 2880733.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 181.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0057641384191811085, "kl": 7.436275336658582e-05, "learning_rate": 6.933333333333333e-08, "loss": 0.0, "num_tokens": 2880989.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 181.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.007799781858921051, "kl": 0.0011353492736816406, "learning_rate": 6.9e-08, "loss": 0.0001, "num_tokens": 2881249.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 181.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.052825137972831726, "kl": 0.0035371724516153336, "learning_rate": 6.866666666666667e-08, "loss": 0.0002, "num_tokens": 2881549.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 181.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 4.983455657958984, "kl": 0.12283117696642876, "learning_rate": 6.833333333333334e-08, "loss": -0.0039, "num_tokens": 2881873.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 9796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 181.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.013385354541242123, "kl": 0.0004972443566657603, "learning_rate": 6.8e-08, "loss": 0.0, "num_tokens": 2882191.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 181.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.04007229208946228, "kl": 0.09807289391756058, "learning_rate": 6.766666666666666e-08, "loss": 0.0049, "num_tokens": 2882563.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 181.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04402105510234833, "kl": 0.009249531663954258, "learning_rate": 6.733333333333333e-08, "loss": 0.0005, "num_tokens": 2882852.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 181.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05748549476265907, "kl": 0.00761270709335804, "learning_rate": 6.7e-08, "loss": 0.0004, "num_tokens": 2883142.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 181.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.044741880148649216, "kl": 0.007926232647150755, "learning_rate": 6.666666666666667e-08, "loss": 0.0004, "num_tokens": 2883488.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 181.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.025197431445121765, "kl": 0.005192175507545471, "learning_rate": 6.633333333333334e-08, "loss": 0.0003, "num_tokens": 2883756.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 181.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01790686324238777, "kl": 0.0023043788969516754, "learning_rate": 6.6e-08, "loss": 0.0001, "num_tokens": 2884068.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 181.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.003130113473162055, "kl": 0.0002112908405251801, "learning_rate": 6.566666666666667e-08, "loss": 0.0, "num_tokens": 2884377.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 181.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.9966764450073242, "kl": 0.10453439690172672, "learning_rate": 6.533333333333334e-08, "loss": -0.0288, "num_tokens": 2884779.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 9805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 181.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.026720933616161346, "kl": 0.0009430637292098254, "learning_rate": 6.5e-08, "loss": 0.0, "num_tokens": 2885049.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.017182189971208572, "kl": 0.0002554208040237427, "learning_rate": 6.466666666666668e-08, "loss": 0.0, "num_tokens": 2885261.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 181.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.1586709022521973, "kl": 0.1482773907482624, "learning_rate": 6.433333333333333e-08, "loss": -0.0462, "num_tokens": 2885534.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 181.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.106722593307495, "kl": 0.03738443832844496, "learning_rate": 6.4e-08, "loss": 0.0009, "num_tokens": 2885834.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 181.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.21426236629486084, "kl": 0.09430229105055332, "learning_rate": 6.366666666666667e-08, "loss": 0.0047, "num_tokens": 2886172.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 181.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.051380306482315063, "kl": 0.011238531209528446, "learning_rate": 6.333333333333333e-08, "loss": 0.0006, "num_tokens": 2886500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 181.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.027584191411733627, "kl": 0.0011591293732635677, "learning_rate": 6.300000000000001e-08, "loss": 0.0001, "num_tokens": 2886734.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 181.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.049617331475019455, "kl": 0.013029744965024292, "learning_rate": 6.266666666666666e-08, "loss": 0.0007, "num_tokens": 2887020.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 181.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06499826908111572, "kl": 0.0023485174169763923, "learning_rate": 6.233333333333334e-08, "loss": 0.0001, "num_tokens": 2887236.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 181.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.09056086838245392, "kl": 0.005128631251864135, "learning_rate": 6.2e-08, "loss": 0.0003, "num_tokens": 2887508.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 181.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.12840566039085388, "kl": 0.01657271245494485, "learning_rate": 6.166666666666668e-08, "loss": 0.0008, "num_tokens": 2887799.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 181.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.16025394201278687, "kl": 0.02720168326050043, "learning_rate": 6.133333333333334e-08, "loss": 0.0015, "num_tokens": 2888095.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 181.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.19129562377929688, "kl": 0.03511792724020779, "learning_rate": 6.099999999999999e-08, "loss": 0.0021, "num_tokens": 2888438.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 181.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.05932667851448059, "kl": 0.012376388534903526, "learning_rate": 6.066666666666667e-08, "loss": 0.0006, "num_tokens": 2888767.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 181.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012764434795826674, "kl": 0.00033229589462280273, "learning_rate": 6.033333333333333e-08, "loss": 0.0, "num_tokens": 2889027.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 181.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 1.832655429840088, "kl": 0.14973975904285908, "learning_rate": 6.000000000000001e-08, "loss": -0.0326, "num_tokens": 2889395.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 181.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03700793534517288, "kl": 0.006786672165617347, "learning_rate": 5.966666666666667e-08, "loss": 0.0003, "num_tokens": 2889673.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 181.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.8758647441864014, "kl": 0.03196819685399532, "learning_rate": 5.933333333333333e-08, "loss": 0.2018, "num_tokens": 2889998.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 181.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07538042217493057, "kl": 0.0030323906103149056, "learning_rate": 5.9e-08, "loss": 0.0002, "num_tokens": 2890269.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 181.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.022711871191859245, "kl": 0.0019327686168253422, "learning_rate": 5.866666666666667e-08, "loss": 0.0001, "num_tokens": 2890565.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.023565754294395447, "kl": 0.0006713151960866526, "learning_rate": 5.833333333333333e-08, "loss": 0.0, "num_tokens": 2890784.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 181.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07558811455965042, "kl": 0.0033678172621876, "learning_rate": 5.8e-08, "loss": 0.0001, "num_tokens": 2891054.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 182.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01791425794363022, "kl": 0.012602954637259245, "learning_rate": 5.7666666666666673e-08, "loss": 0.0006, "num_tokens": 2891314.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 182.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.027833988890051842, "kl": 0.000940871424973011, "learning_rate": 5.733333333333333e-08, "loss": 0.0, "num_tokens": 2891549.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 182.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04389351233839989, "kl": 0.009435498155653477, "learning_rate": 5.7e-08, "loss": 0.0005, "num_tokens": 2891838.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.050671618431806564, "kl": 0.0014690712559968233, "learning_rate": 5.666666666666667e-08, "loss": 0.0001, "num_tokens": 2892057.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 182.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09843046963214874, "kl": 0.02210741490125656, "learning_rate": 5.6333333333333335e-08, "loss": 0.0011, "num_tokens": 2892358.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.046263426542282104, "kl": 0.0021884179150220007, "learning_rate": 5.6000000000000005e-08, "loss": 0.0001, "num_tokens": 2892656.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 182.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.019380565732717514, "kl": 0.0006750524044036865, "learning_rate": 5.566666666666666e-08, "loss": 0.0, "num_tokens": 2892868.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 182.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03707139194011688, "kl": 0.0010180601675529033, "learning_rate": 5.533333333333333e-08, "loss": 0.0001, "num_tokens": 2893140.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 182.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0200099665671587, "kl": 0.0006193131339387037, "learning_rate": 5.5e-08, "loss": 0.0, "num_tokens": 2893396.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 182.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.7514722943305969, "kl": 0.09609995130449533, "learning_rate": 5.466666666666667e-08, "loss": 0.005, "num_tokens": 2893740.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 182.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.2288810759782791, "kl": 0.056039443239569664, "learning_rate": 5.433333333333334e-08, "loss": 0.003, "num_tokens": 2894098.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 182.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.13364006578922272, "kl": 0.01917952485382557, "learning_rate": 5.4e-08, "loss": 0.0009, "num_tokens": 2894490.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 182.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.017981266602873802, "kl": 0.04103469289839268, "learning_rate": 5.3666666666666664e-08, "loss": 0.002, "num_tokens": 2894895.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 182.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.005609320942312479, "kl": 0.0006834566593170166, "learning_rate": 5.3333333333333334e-08, "loss": 0.0, "num_tokens": 2895179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 182.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.017934110015630722, "kl": 0.0007517827034462243, "learning_rate": 5.3000000000000005e-08, "loss": 0.0, "num_tokens": 2895501.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.2706718444824219, "kl": 0.02328595519065857, "learning_rate": 5.2666666666666675e-08, "loss": 0.0012, "num_tokens": 2895717.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.16941288113594055, "kl": 0.012295078253373504, "learning_rate": 5.233333333333333e-08, "loss": 0.0006, "num_tokens": 2895980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 182.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06400128453969955, "kl": 0.0343914981931448, "learning_rate": 5.1999999999999996e-08, "loss": 0.0017, "num_tokens": 2896293.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 182.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.07269112765789032, "kl": 0.0035394877195358276, "learning_rate": 5.1666666666666666e-08, "loss": 0.0002, "num_tokens": 2896553.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 182.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.020313585177063942, "kl": 0.0014387592382263392, "learning_rate": 5.1333333333333336e-08, "loss": 0.0001, "num_tokens": 2896831.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 182.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.010828782804310322, "kl": 0.007247310597449541, "learning_rate": 5.100000000000001e-08, "loss": 0.0004, "num_tokens": 2897103.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 182.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 2.221320867538452, "kl": 0.32922819582745433, "learning_rate": 5.0666666666666664e-08, "loss": 0.0187, "num_tokens": 2897366.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 182.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028008262161165476, "kl": 0.00014487239241134375, "learning_rate": 5.0333333333333334e-08, "loss": 0.0, "num_tokens": 2897678.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 182.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.9942631721496582, "kl": 0.2854239344596863, "learning_rate": 5e-08, "loss": 0.0144, "num_tokens": 2897988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 182.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.08285778760910034, "kl": 0.024885154329240322, "learning_rate": 4.966666666666667e-08, "loss": 0.0013, "num_tokens": 2898276.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 182.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12954916059970856, "kl": 0.07800208777189255, "learning_rate": 4.933333333333334e-08, "loss": 0.0039, "num_tokens": 2898662.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.038082655519247055, "kl": 0.0021064550091978163, "learning_rate": 4.8999999999999995e-08, "loss": 0.0001, "num_tokens": 2898916.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 182.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.2266039848327637, "kl": 0.005796613288111985, "learning_rate": 4.8666666666666666e-08, "loss": -0.0142, "num_tokens": 2899208.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02271237038075924, "kl": 0.0020514721982181072, "learning_rate": 4.8333333333333336e-08, "loss": 0.0001, "num_tokens": 2899485.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.25, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 182.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.9496564865112305, "kl": 0.07561144791543484, "learning_rate": 4.8e-08, "loss": 0.1046, "num_tokens": 2899894.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.07119078934192657, "kl": 0.009405347518622875, "learning_rate": 4.766666666666667e-08, "loss": 0.0005, "num_tokens": 2900163.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9858 }, { "clip_ratio/high_max": 0.00909090880304575, "clip_ratio/high_mean": 0.00909090880304575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00909090880304575, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 182.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.135387420654297, "kl": 0.06301749683916569, "learning_rate": 4.733333333333334e-08, "loss": -0.0035, "num_tokens": 2900487.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 9859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 182.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04674152284860611, "kl": 0.009944313438609242, "learning_rate": 4.7e-08, "loss": 0.0005, "num_tokens": 2900785.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 182.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 7.004546642303467, "kl": 0.05565602611750364, "learning_rate": 4.666666666666667e-08, "loss": 0.251, "num_tokens": 2901072.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 182.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04640190303325653, "kl": 0.0042971475049853325, "learning_rate": 4.633333333333334e-08, "loss": 0.0002, "num_tokens": 2901372.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 182.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00401738565415144, "kl": 0.00024271011352539062, "learning_rate": 4.6e-08, "loss": 0.0, "num_tokens": 2901616.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 182.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.005986438598483801, "kl": 0.003446533199166879, "learning_rate": 4.5666666666666665e-08, "loss": 0.0002, "num_tokens": 2901874.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 182.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.054798975586891174, "kl": 0.013443166855722666, "learning_rate": 4.5333333333333336e-08, "loss": 0.0007, "num_tokens": 2902201.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 182.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.616187572479248, "kl": 0.1407563267275691, "learning_rate": 4.5e-08, "loss": -0.1066, "num_tokens": 2902499.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07756587862968445, "kl": 0.001069672405719757, "learning_rate": 4.466666666666667e-08, "loss": 0.0001, "num_tokens": 2902711.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 4.326424598693848, "kl": 0.01726543391123414, "learning_rate": 4.433333333333334e-08, "loss": 0.0007, "num_tokens": 2902983.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 182.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.11409106105566025, "kl": 0.01539933169260621, "learning_rate": 4.4000000000000004e-08, "loss": 0.0008, "num_tokens": 2903307.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 3.8136760849738494e-05, "kl": 1.691281795501709e-06, "learning_rate": 4.366666666666667e-08, "loss": 0.0, "num_tokens": 2903527.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 182.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.0188217163085938, "kl": 0.0944785475730896, "learning_rate": 4.333333333333333e-08, "loss": -0.0308, "num_tokens": 2903888.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 182.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01453560683876276, "kl": 0.07380939088761806, "learning_rate": 4.3e-08, "loss": 0.0037, "num_tokens": 2904258.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 182.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.7986859083175659, "kl": 0.5325427949428558, "learning_rate": 4.266666666666667e-08, "loss": 0.04, "num_tokens": 2904563.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 182.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05519038438796997, "kl": 0.00658091323566623, "learning_rate": 4.2333333333333335e-08, "loss": 0.0002, "num_tokens": 2904879.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 182.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 0.9498788714408875, "kl": 0.05868354067206383, "learning_rate": 4.2e-08, "loss": 0.0032, "num_tokens": 2905223.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 182.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 7.024899005889893, "kl": 0.04051691293716431, "learning_rate": 4.166666666666666e-08, "loss": 0.0039, "num_tokens": 2905525.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 9876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 182.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 2.4432549476623535, "kl": 0.016404787078499794, "learning_rate": 4.133333333333333e-08, "loss": 0.0299, "num_tokens": 2905839.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.026123646646738052, "kl": 0.012449371162801981, "learning_rate": 4.1e-08, "loss": 0.0007, "num_tokens": 2906111.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.001900079776532948, "kl": 0.003551207482814789, "learning_rate": 4.066666666666667e-08, "loss": 0.0002, "num_tokens": 2906347.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 182.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.014301084913313389, "kl": 0.0002517402172088623, "learning_rate": 4.033333333333334e-08, "loss": 0.0, "num_tokens": 2906551.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 182.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.8679953813552856, "kl": 0.01237863814458251, "learning_rate": 4e-08, "loss": -0.0753, "num_tokens": 2906908.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 9881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 183.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.11042208224534988, "kl": 0.012875689659267664, "learning_rate": 3.9666666666666665e-08, "loss": 0.0007, "num_tokens": 2907198.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 183.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.059440977871418, "kl": 0.03435606695711613, "learning_rate": 3.9333333333333335e-08, "loss": 0.0017, "num_tokens": 2907603.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046914175618439913, "kl": 1.689046621322632e-05, "learning_rate": 3.9e-08, "loss": 0.0, "num_tokens": 2907815.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 183.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.04074396565556526, "kl": 0.002172502805478871, "learning_rate": 3.866666666666667e-08, "loss": 0.0001, "num_tokens": 2908075.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 183.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.026256661862134933, "kl": 0.0010736336407717317, "learning_rate": 3.833333333333334e-08, "loss": 0.0001, "num_tokens": 2908307.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 183.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.038920480757951736, "kl": 0.002573552686953917, "learning_rate": 3.8e-08, "loss": 0.0001, "num_tokens": 2908567.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 183.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.539245128631592, "kl": 0.0691833607852459, "learning_rate": 3.7666666666666666e-08, "loss": 0.1144, "num_tokens": 2908920.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 183.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.044224534183740616, "kl": 0.01291979430243373, "learning_rate": 3.733333333333333e-08, "loss": 0.0007, "num_tokens": 2909194.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 183.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.609105110168457, "kl": 0.07271228171885014, "learning_rate": 3.7e-08, "loss": 0.0065, "num_tokens": 2909498.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 183.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.3273508548736572, "kl": 0.01727142045274377, "learning_rate": 3.666666666666667e-08, "loss": 0.1236, "num_tokens": 2909841.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 183.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03298269957304001, "kl": 0.008421921404078603, "learning_rate": 3.6333333333333334e-08, "loss": 0.0004, "num_tokens": 2910169.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 183.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.074802465736866, "kl": 0.027217524126172066, "learning_rate": 3.6000000000000005e-08, "loss": 0.0013, "num_tokens": 2910515.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 183.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03195382282137871, "kl": 0.000666402280330658, "learning_rate": 3.566666666666667e-08, "loss": 0.0, "num_tokens": 2910725.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 183.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.08096520602703094, "kl": 0.020325182005763054, "learning_rate": 3.533333333333333e-08, "loss": 0.001, "num_tokens": 2910997.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 183.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.029553748667240143, "kl": 0.004050072107929736, "learning_rate": 3.5e-08, "loss": 0.0002, "num_tokens": 2911265.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 4.620262622833252, "kl": 0.5815833956003189, "learning_rate": 3.4666666666666666e-08, "loss": 0.0426, "num_tokens": 2911508.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 8.405125299759675e-06, "kl": 1.862645149230957e-06, "learning_rate": 3.4333333333333336e-08, "loss": 0.0, "num_tokens": 2911728.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 183.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07107184827327728, "kl": 0.006637036451138556, "learning_rate": 3.4e-08, "loss": 0.0003, "num_tokens": 2912000.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 183.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.9404093027114868, "kl": 0.2910590171813965, "learning_rate": 3.3666666666666664e-08, "loss": -0.0842, "num_tokens": 2912366.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 9900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 183.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 1.4035552740097046, "kl": 0.1563080116175115, "learning_rate": 3.3333333333333334e-08, "loss": 0.0085, "num_tokens": 2912664.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 183.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.016459928825497627, "kl": 0.0008407303830608726, "learning_rate": 3.3e-08, "loss": 0.0, "num_tokens": 2912946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 183.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.028843404725193977, "kl": 0.00676688551902771, "learning_rate": 3.266666666666667e-08, "loss": 0.0003, "num_tokens": 2913218.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 183.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.01898745447397232, "kl": 0.0006910534575581551, "learning_rate": 3.233333333333334e-08, "loss": 0.0, "num_tokens": 2913530.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 183.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.4804060459136963, "kl": 0.1441753190010786, "learning_rate": 3.2e-08, "loss": -0.014, "num_tokens": 2913887.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 183.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 4.594256401062012, "kl": 0.042867622105404735, "learning_rate": 3.1666666666666666e-08, "loss": 0.0058, "num_tokens": 2914169.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 183.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.631552696228027, "kl": 0.03864024020731449, "learning_rate": 3.133333333333333e-08, "loss": -0.0338, "num_tokens": 2914516.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 183.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07126500457525253, "kl": 0.01517056580632925, "learning_rate": 3.1e-08, "loss": 0.0007, "num_tokens": 2914819.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 183.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07337877154350281, "kl": 0.016658049076795578, "learning_rate": 3.066666666666667e-08, "loss": 0.001, "num_tokens": 2915101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 183.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.11530859768390656, "kl": 0.032312868162989616, "learning_rate": 3.0333333333333334e-08, "loss": 0.0016, "num_tokens": 2915415.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 183.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.6236201524734497, "kl": 0.07196260988712311, "learning_rate": 3.0000000000000004e-08, "loss": 0.0035, "num_tokens": 2915660.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 183.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.07097512483596802, "kl": 0.002381057245656848, "learning_rate": 2.9666666666666664e-08, "loss": 0.0001, "num_tokens": 2915974.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 183.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0489969365298748, "kl": 0.005790283539681695, "learning_rate": 2.9333333333333335e-08, "loss": 0.0003, "num_tokens": 2916290.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 183.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.644816517829895, "kl": 0.06182416994124651, "learning_rate": 2.9e-08, "loss": 0.0032, "num_tokens": 2916564.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 183.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034353183582425117, "kl": 0.0003933049738407135, "learning_rate": 2.8666666666666665e-08, "loss": 0.0, "num_tokens": 2916824.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 183.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 1.3449805974960327, "kl": 0.4834003150463104, "learning_rate": 2.8333333333333336e-08, "loss": 0.0238, "num_tokens": 2917127.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 183.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.021869849413633347, "kl": 0.0009674280881881714, "learning_rate": 2.8000000000000003e-08, "loss": 0.0, "num_tokens": 2917339.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.005327875725924969, "kl": 0.0014873594045639038, "learning_rate": 2.7666666666666666e-08, "loss": 0.0001, "num_tokens": 2917555.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 183.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.2582266330718994, "kl": 0.003645677206804976, "learning_rate": 2.7333333333333337e-08, "loss": 0.0022, "num_tokens": 2917886.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 183.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0511690117418766, "kl": 0.0010033586295321584, "learning_rate": 2.7e-08, "loss": 0.0001, "num_tokens": 2918143.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 183.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03563246876001358, "kl": 0.0030860661063343287, "learning_rate": 2.6666666666666667e-08, "loss": 0.0002, "num_tokens": 2918431.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 183.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.005469589959830046, "kl": 0.00027230083651375026, "learning_rate": 2.6333333333333338e-08, "loss": 0.0, "num_tokens": 2918691.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 183.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04079728201031685, "kl": 0.008279956877231598, "learning_rate": 2.5999999999999998e-08, "loss": 0.0004, "num_tokens": 2918981.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 183.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.9211490154266357, "kl": 0.23908530501648784, "learning_rate": 2.5666666666666668e-08, "loss": -0.0115, "num_tokens": 2919250.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 9924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 183.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.000467156118247658, "kl": 0.0013129416620358825, "learning_rate": 2.5333333333333332e-08, "loss": 0.0001, "num_tokens": 2919527.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 183.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01972462236881256, "kl": 0.0071532020810991526, "learning_rate": 2.5e-08, "loss": 0.0004, "num_tokens": 2919823.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 183.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.018887687474489212, "kl": 0.0007814254495315254, "learning_rate": 2.466666666666667e-08, "loss": 0.0, "num_tokens": 2920103.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 183.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02301059663295746, "kl": 0.09546614065766335, "learning_rate": 2.4333333333333333e-08, "loss": 0.0048, "num_tokens": 2920476.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 183.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.04622969776391983, "kl": 0.006424385355785489, "learning_rate": 2.4e-08, "loss": 0.0003, "num_tokens": 2920810.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 183.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.12056931853294373, "kl": 0.06816163286566734, "learning_rate": 2.366666666666667e-08, "loss": 0.0035, "num_tokens": 2921184.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 183.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.07800615578889847, "kl": 0.03056582622230053, "learning_rate": 2.3333333333333334e-08, "loss": 0.0015, "num_tokens": 2921486.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 183.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.035695284605026245, "kl": 0.0036770704900845885, "learning_rate": 2.3e-08, "loss": 0.0002, "num_tokens": 2921820.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.21672280132770538, "kl": 0.016340750502422452, "learning_rate": 2.2666666666666668e-08, "loss": 0.0011, "num_tokens": 2922043.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 183.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.037876784801483154, "kl": 0.00466212525498122, "learning_rate": 2.2333333333333335e-08, "loss": 0.0002, "num_tokens": 2922334.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 183.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008505430072546005, "kl": 0.1598047837615013, "learning_rate": 2.2000000000000002e-08, "loss": 0.008, "num_tokens": 2922644.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0425378791987896, "kl": 0.00784134236164391, "learning_rate": 2.1666666666666665e-08, "loss": 0.0004, "num_tokens": 2922931.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 184.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028524266090244055, "kl": 0.00020351459534140304, "learning_rate": 2.1333333333333336e-08, "loss": 0.0, "num_tokens": 2923195.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 184.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.026593690738081932, "kl": 0.0007211466581793502, "learning_rate": 2.1e-08, "loss": 0.0, "num_tokens": 2923415.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 184.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 9.173880243906751e-05, "kl": 2.6598572731018066e-06, "learning_rate": 2.0666666666666666e-08, "loss": 0.0, "num_tokens": 2923635.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04364466667175293, "kl": 0.009181763249216601, "learning_rate": 2.0333333333333333e-08, "loss": 0.0005, "num_tokens": 2923922.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 184.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0627545490860939, "kl": 0.02671348676085472, "learning_rate": 2e-08, "loss": 0.0013, "num_tokens": 2924268.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 184.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 8.020615577697754, "kl": 0.030892505426891148, "learning_rate": 1.9666666666666667e-08, "loss": 0.2241, "num_tokens": 2924514.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.034249190241098404, "kl": 0.0062084178207442164, "learning_rate": 1.9333333333333334e-08, "loss": 0.0003, "num_tokens": 2924818.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 184.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 6.559693336486816, "kl": 0.06395634077489376, "learning_rate": 1.9e-08, "loss": 0.0509, "num_tokens": 2925093.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 184.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0060663470067083836, "kl": 0.00030325717671075836, "learning_rate": 1.8666666666666665e-08, "loss": 0.0, "num_tokens": 2925404.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 184.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.20037707686424255, "kl": 0.01274197647580877, "learning_rate": 1.8333333333333335e-08, "loss": 0.0007, "num_tokens": 2925730.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 184.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.004123511258512735, "kl": 0.00024154037237167358, "learning_rate": 1.8000000000000002e-08, "loss": 0.0, "num_tokens": 2925974.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 184.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03306548669934273, "kl": 0.009320731740444899, "learning_rate": 1.7666666666666666e-08, "loss": 0.0005, "num_tokens": 2926297.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 184.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04611361399292946, "kl": 0.002875296981073916, "learning_rate": 1.7333333333333333e-08, "loss": 0.0001, "num_tokens": 2926569.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 184.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.039583124220371246, "kl": 0.006974290125072002, "learning_rate": 1.7e-08, "loss": 0.0003, "num_tokens": 2926860.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 184.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.023145070299506187, "kl": 0.09695570915937424, "learning_rate": 1.6666666666666667e-08, "loss": 0.0048, "num_tokens": 2927232.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 184.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04906821250915527, "kl": 0.0006284117553150281, "learning_rate": 1.6333333333333334e-08, "loss": 0.0, "num_tokens": 2927488.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 184.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09616069495677948, "kl": 0.01611461688298732, "learning_rate": 1.6e-08, "loss": 0.001, "num_tokens": 2927807.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 184.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.1833784133195877, "kl": 0.05304853431880474, "learning_rate": 1.5666666666666665e-08, "loss": 0.0024, "num_tokens": 2928194.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 184.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033897364046424627, "kl": 0.00013303756713867188, "learning_rate": 1.5333333333333335e-08, "loss": 0.0, "num_tokens": 2928414.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 184.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.1685088872909546, "kl": 0.17326711118221283, "learning_rate": 1.5000000000000002e-08, "loss": 0.0087, "num_tokens": 2928725.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 184.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.5447914600372314, "kl": 0.24420192709658295, "learning_rate": 1.4666666666666667e-08, "loss": -0.0272, "num_tokens": 2929074.0, "reward": 5.0, "reward_std": 3.5590262413024902, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.5590262413024902, "step": 9957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 184.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04683195427060127, "kl": 0.0038846245734021068, "learning_rate": 1.4333333333333333e-08, "loss": 0.0002, "num_tokens": 2929346.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.12173474580049515, "kl": 0.02009764825925231, "learning_rate": 1.4000000000000001e-08, "loss": 0.0012, "num_tokens": 2929628.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 184.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03806106001138687, "kl": 0.0033851079642772675, "learning_rate": 1.3666666666666668e-08, "loss": 0.0002, "num_tokens": 2929940.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 184.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02742687612771988, "kl": 0.005892345157917589, "learning_rate": 1.3333333333333334e-08, "loss": 0.0003, "num_tokens": 2930228.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 184.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044488986022770405, "kl": 2.4691224098205566e-05, "learning_rate": 1.2999999999999999e-08, "loss": 0.0, "num_tokens": 2930440.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 184.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.013761185109615326, "kl": 0.002048698952421546, "learning_rate": 1.2666666666666666e-08, "loss": 0.0001, "num_tokens": 2930790.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 184.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.018396787345409393, "kl": 0.012444132007658482, "learning_rate": 1.2333333333333335e-08, "loss": 0.0006, "num_tokens": 2931050.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 184.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01640355959534645, "kl": 0.2655911296606064, "learning_rate": 1.2e-08, "loss": 0.0133, "num_tokens": 2931354.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 184.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0358988493680954, "kl": 0.0027944179018959403, "learning_rate": 1.1666666666666667e-08, "loss": 0.0001, "num_tokens": 2931655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 184.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 7.697304725646973, "kl": 0.0037406296469271183, "learning_rate": 1.1333333333333334e-08, "loss": 0.3122, "num_tokens": 2931878.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.062496013939380646, "kl": 0.003379064262844622, "learning_rate": 1.1000000000000001e-08, "loss": 0.0002, "num_tokens": 2932145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 184.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 6.009119510650635, "kl": 0.11926095932722092, "learning_rate": 1.0666666666666668e-08, "loss": 0.1536, "num_tokens": 2932465.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 184.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.08952630311250687, "kl": 0.006811510305851698, "learning_rate": 1.0333333333333333e-08, "loss": 0.0004, "num_tokens": 2932695.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 184.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.000314667442580685, "kl": 0.001240525976754725, "learning_rate": 1e-08, "loss": 0.0001, "num_tokens": 2932975.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 184.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.0069992542266846, "kl": 0.07766726985573769, "learning_rate": 9.666666666666667e-09, "loss": 0.0135, "num_tokens": 2933341.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 184.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.6369693279266357, "kl": 0.03867245092988014, "learning_rate": 9.333333333333333e-09, "loss": 0.1959, "num_tokens": 2933627.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 184.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.001838079420849681, "kl": 0.0005452297627925873, "learning_rate": 9.000000000000001e-09, "loss": 0.0, "num_tokens": 2933887.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 184.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.707056522369385, "kl": 0.07085501775145531, "learning_rate": 8.666666666666667e-09, "loss": 0.0259, "num_tokens": 2934206.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 184.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 1.6058717966079712, "kl": 0.06466163269942626, "learning_rate": 8.333333333333334e-09, "loss": 0.0041, "num_tokens": 2934486.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 184.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018751665484160185, "kl": 0.003562338650226593, "learning_rate": 8e-09, "loss": 0.0002, "num_tokens": 2934722.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.11398318409919739, "kl": 0.042106447741389275, "learning_rate": 7.666666666666667e-09, "loss": 0.0022, "num_tokens": 2935017.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 184.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.016550902277231216, "kl": 0.0007434528088197112, "learning_rate": 7.333333333333334e-09, "loss": 0.0, "num_tokens": 2935339.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 184.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.5287065505981445, "kl": 0.07356359669938684, "learning_rate": 7.000000000000001e-09, "loss": 0.1737, "num_tokens": 2935657.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 9980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 184.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02500924840569496, "kl": 0.006159370765089989, "learning_rate": 6.666666666666667e-09, "loss": 0.0003, "num_tokens": 2936005.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 184.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.5838184356689453, "kl": 0.13607493788003922, "learning_rate": 6.333333333333333e-09, "loss": 0.0149, "num_tokens": 2936379.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 9982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 184.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.08272148668766022, "kl": 0.007379991700872779, "learning_rate": 6e-09, "loss": 0.0004, "num_tokens": 2936647.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.15359057486057281, "kl": 0.018743189051747322, "learning_rate": 5.666666666666667e-09, "loss": 0.0011, "num_tokens": 2936935.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 184.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.02734927274286747, "kl": 0.05121681094169617, "learning_rate": 5.333333333333334e-09, "loss": 0.0025, "num_tokens": 2937271.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 184.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.17441505193710327, "kl": 0.034458561800420284, "learning_rate": 5e-09, "loss": 0.0017, "num_tokens": 2937567.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 184.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02018146589398384, "kl": 0.00628455744299572, "learning_rate": 4.666666666666666e-09, "loss": 0.0003, "num_tokens": 2937839.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 184.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.051395975053310394, "kl": 0.002733725297730416, "learning_rate": 4.333333333333333e-09, "loss": 0.0002, "num_tokens": 2938093.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 184.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03161874786019325, "kl": 0.0021529156947508454, "learning_rate": 4e-09, "loss": 0.0001, "num_tokens": 2938389.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 185.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.5798838138580322, "kl": 0.04691888391971588, "learning_rate": 3.666666666666667e-09, "loss": 0.0062, "num_tokens": 2938772.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 185.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 8.500663757324219, "kl": 0.16840357484761626, "learning_rate": 3.3333333333333334e-09, "loss": 0.0838, "num_tokens": 2939035.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 185.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.018531130626797676, "kl": 0.0008435696363449097, "learning_rate": 3e-09, "loss": 0.0, "num_tokens": 2939247.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 185.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 3.4365555620752275e-05, "kl": 2.086162567138672e-06, "learning_rate": 2.666666666666667e-09, "loss": 0.0, "num_tokens": 2939467.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 185.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07230444997549057, "kl": 0.011429775040596724, "learning_rate": 2.333333333333333e-09, "loss": 0.0006, "num_tokens": 2939762.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 185.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.12053240835666656, "kl": 0.014214991824701428, "learning_rate": 2e-09, "loss": 0.0007, "num_tokens": 2940055.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 185.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.8724961280822754, "kl": 0.19114062655717134, "learning_rate": 1.6666666666666667e-09, "loss": 0.0543, "num_tokens": 2940362.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 9996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 185.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.1254260390996933, "kl": 0.009432489052414894, "learning_rate": 1.3333333333333335e-09, "loss": 0.0005, "num_tokens": 2940633.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 185.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.3686263859272003, "kl": 0.029108582995831966, "learning_rate": 1e-09, "loss": 0.0015, "num_tokens": 2940917.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 185.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.9218740463256836, "kl": 0.005491628777235746, "learning_rate": 6.666666666666667e-10, "loss": 0.0235, "num_tokens": 2941192.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 185.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.007952062413096428, "kl": 0.0004007460083812475, "learning_rate": 3.3333333333333337e-10, "loss": 0.0, "num_tokens": 2941503.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 10000 } ], "logging_steps": 1, "max_steps": 10000, "num_input_tokens_seen": 2941503, "num_train_epochs": 186, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }